GPU allocation for CPX Unit Tests using PCI bus id (#1403)

* Map devices according to their PCI bus ID

* Allocate GPUs using the PCI bus ID mapping

* Passing gpuPriorityOrder in as an argument rather than making the functions non-static.

* Removing redundant calls on the testBed instance

[ROCm/rccl commit: 69b2b712ab]
This commit is contained in:
saurabhAMD
2024-11-04 10:51:00 -06:00
gecommit door GitHub
bovenliggende ad1384bea1
commit 69d976532b
9 gewijzigde bestanden met toevoegingen van 119 en 19 verwijderingen
@@ -193,7 +193,8 @@ namespace RcclUnitTesting
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder));
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
{
@@ -73,7 +73,8 @@ namespace RcclUnitTesting
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder));
// Prepare AllToAllV options
std::vector<size_t> numInputElements;
@@ -130,7 +131,8 @@ namespace RcclUnitTesting
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder));
// Prepare AllToAllV options
std::vector<size_t> numInputElements;
+10 -5
Bestand weergeven
@@ -28,7 +28,8 @@ namespace RcclUnitTesting
{
// Test either single process all GPUs, or 1 process per GPU
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup);
if (testBed.ev.showNames)
INFO("%s %d-ranks GroupCall Identical\n", isMultiProcess ? "MP" : "SP", totalRanks);
@@ -84,7 +85,8 @@ namespace RcclUnitTesting
{
// Test either single process all GPUs, or 1 process per GPU
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup);
if (testBed.ev.showNames)
INFO("%s %d-ranks GroupCall Different\n", isMultiProcess ? "MP" : "SP", totalRanks);
@@ -139,7 +141,8 @@ namespace RcclUnitTesting
{
// Test either single process all GPUs, or 1 process per GPU
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup);
if (testBed.ev.showNames)
INFO("%s %d-ranks GroupCall MixedDayaType\n", isMultiProcess ? "MP" : "SP", totalRanks);
@@ -194,7 +197,8 @@ namespace RcclUnitTesting
INFO("%s %d-ranks Multistream %d-Group Calls across %d streams\n",
isMultiProcess ? "MP" : "SP", totalRanks, numCollPerGroup, numStreamsPerGroup);
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks),
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder),
numCollPerGroup, numStreamsPerGroup);
// Set up each collective in group in different stream (modulo numStreamsPerGroup)
@@ -244,7 +248,8 @@ namespace RcclUnitTesting
int const numProcesses = isMultiProcess ? totalRanks : 1;
// Initialize comms by specifying the # of group calls
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking);
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking);
if (testBed.ev.showNames)
INFO("%s %d-ranks GroupCall MultiGroupCall\n", isMultiProcess ? "MP" : "SP", totalRanks);
@@ -34,7 +34,8 @@ namespace RcclUnitTesting
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
// Initialize communicators in non-blocking mode
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1, 1, 1, useBlocking);
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), 1, 1, 1, useBlocking);
// Loop over various collective functions
for (auto funcType : funcTypes)
+4 -2
Bestand weergeven
@@ -27,7 +27,8 @@ namespace RcclUnitTesting
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
int totalRanks = numGpus * ranksPerGpu;
int const numProcesses = isMultiProcess ? numGpus : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu, gpuPriorityOrder),
{1,2}, //two group, second group sendrecv to self, has 2 coll
testBed.GetNumStreamsPerGroup(1,2),
2);
@@ -119,7 +120,8 @@ namespace RcclUnitTesting
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
int totalRanks = numGpus * ranksPerGpu;
int const numProcesses = isMultiProcess ? numGpus : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu, gpuPriorityOrder),
{1,2}, //two group, second group sendrecv to self, has 2 coll
testBed.GetNumStreamsPerGroup(1,2),
2);
@@ -123,6 +123,73 @@ namespace RcclUnitTesting
return 0;
}
// Converts a PCI bus id string (e.g. "0000:03:00.0") into an integer key.
//
// ':' separators are skipped and parsing stops at the first '.', so every PCI
// function of the same physical device yields the same id. Parsing also stops
// at the first non-hex character (including the terminating '\0') or once 16
// hex digits have been collected.
//
// busId : null-terminated bus id string; must not be null.
// id    : receives the parsed value (0 when no hex digit was found); must not be null.
// Returns ncclSuccess unconditionally.
ncclResult_t busIdToInt64(const char* busId, int64_t* id) {
  char hexStr[17]; // Longest possible int64 hex string + null terminator.
  size_t hexOffset = 0;
  for (size_t i = 0; hexOffset < sizeof(hexStr) - 1; ++i) {
    char const c = busId[i];
    if (c == ':') continue;
    if (c == '.') break; // ignore everything after '.', it belongs to the same physical PCI device
    bool const isHexDigit = (c >= '0' && c <= '9') ||
                            (c >= 'A' && c <= 'F') ||
                            (c >= 'a' && c <= 'f');
    if (!isHexDigit) break;
    hexStr[hexOffset++] = c;
  }
  hexStr[hexOffset] = '\0';
  // Parse as a full 64-bit unsigned value: strtol() works on `long`, which
  // loses the upper digits where long is 32 bits and saturates for 16-digit
  // ids with the top bit set, making distinct bus ids collide.
  *id = static_cast<int64_t>(strtoull(hexStr, nullptr, 16));
  return ncclSuccess;
}
int getDevicePriority (std::vector<int> *gpuPriorityOrder){
// Prepare parent->child pipe
int pipefd[2];
if (pipe(pipefd) == -1) {
ERROR("Unable to create parent->child pipe for getting the device priority vector.\n");
return TEST_FAIL;
}
pid_t pid = fork();
if (0 == pid) {
std::vector<int> result;
try {
int numDev;
hipGetDeviceCount(&numDev);
std::unordered_map<int64_t, std::vector<int>> uniqueIdToGpuIndexes;
for(int dev=0;dev<numDev;dev++){
char busIdStr[] = "00000000:00:00.0";
int64_t busId;
hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), dev);
busIdToInt64(busIdStr, &busId);
uniqueIdToGpuIndexes[busId].push_back(dev);
}
std::vector<std::pair<int64_t, std::vector<int>>> sortedIds(uniqueIdToGpuIndexes.begin(), uniqueIdToGpuIndexes.end());
std::sort(sortedIds.begin(), sortedIds.end(), [](const auto& a, const auto& b) {
return a.second.size() > b.second.size();
});
for (const auto& pair : sortedIds) {
result.insert(result.end(), pair.second.begin(), pair.second.end());
}
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
if (write(pipefd[1], result.data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL;
close(pipefd[0]);
close(pipefd[1]);
exit(EXIT_SUCCESS);
}
else {
int status;
if (read(pipefd[0], gpuPriorityOrder->data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL;
waitpid(pid, &status, 0);
assert(!status);
close(pipefd[0]);
close(pipefd[1]);
}
return TEST_SUCCESS;
return 0;
}
EnvVars::EnvVars()
{
@@ -151,10 +218,18 @@ namespace RcclUnitTesting
// Total number of reduction ops
int numOps = ncclNumOps;
gpuPriorityOrder.resize(numDetectedGpus);
for(int i=0;i<numDetectedGpus;i++){
gpuPriorityOrder[i] = i;
}
bool isCpxMode = false;
if(isGfx94) {
getDeviceMode(&isCpxMode);
if(isCpxMode) {
getDevicePriority(&gpuPriorityOrder);
}
}
// Test only pow2 number of GPUs for cpx mode to reduce the runtime for UT
onlyPow2Gpus = GetEnvVar("UT_POW2_GPUS" , isCpxMode); // Default value set based on whether system is in CPX mode. UT_POW2_GPUS set by user overrides it.
@@ -235,6 +310,11 @@ namespace RcclUnitTesting
return numGpusList;
}
std::vector<int> const& EnvVars::GetGpuPriorityOrder()
{
return gpuPriorityOrder;
}
std::vector<int> const& EnvVars::GetIsMultiProcessList()
{
return isMultiProcessList;
@@ -41,6 +41,7 @@ namespace RcclUnitTesting
std::vector<int> const& GetNumGpusList();
std::vector<int> const& GetIsMultiProcessList();
// Returns the GPU indices in the order they should be handed out to ranks:
// GPUs sharing a PCI bus id are kept together, largest groups first.
// NOTE(review): appears to stay at the identity order 0..N-1 unless CPX mode is detected - confirm.
std::vector<int> const& GetGpuPriorityOrder();
void ShowConfig();
protected:
@@ -49,6 +50,7 @@ namespace RcclUnitTesting
std::vector<int> numGpusList; // List of # Gpus to use [UT_MIN_GPUS/UT_MAX_GPUS/UT_POW2_GPUS]
std::vector<int> isMultiProcessList; // Single or multi process [UT_PROCESS_MASK]
int numDetectedGpus;
// GPU indices in allocation priority order: GPUs sharing a PCI bus id are
// adjacent, and groups with more GPUs come first.
std::vector<int> gpuPriorityOrder;
// Helper functions to parse environment variables
int GetEnvVar(std::string const varname, int defaultValue);
+10 -6
Bestand weergeven
@@ -193,7 +193,8 @@ namespace RcclUnitTesting
void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup, int const numStreamsPerGroup, int const numGroupCalls, bool const useBlocking)
{
InitComms(TestBed::GetDeviceIdsList(1, numGpus), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking);
const std::vector<int>& gpuPriorityOrder = ev.GetGpuPriorityOrder();
InitComms(GetDeviceIdsList(1, numGpus, gpuPriorityOrder), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking);
}
void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,
@@ -562,21 +563,23 @@ namespace RcclUnitTesting
}
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
int const numGpus)
int const numGpus,
const std::vector<int>& gpuPriorityOrder)
{
return GetDeviceIdsList(numProcesses, numGpus, 1);
return GetDeviceIdsList(numProcesses, numGpus, 1, gpuPriorityOrder);
}
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
int const numGpus,
int const ranksPerGpu)
int const ranksPerGpu,
const std::vector<int>& gpuPriorityOrder)
{
std::vector<std::vector<int>> result(numProcesses);
int ntasks = numProcesses == 1 ? numGpus : 1;
int k=0;
for (int i = 0; i < numProcesses; i++)
for (int j = 0; j < ntasks * ranksPerGpu; j++) {
result[i].push_back(k%numGpus);
result[i].push_back(gpuPriorityOrder[k%numGpus]);
k++;
}
return result;
@@ -668,7 +671,8 @@ namespace RcclUnitTesting
if(enableSweep == false && (numGpus < 8 || numRanks < 8)) {
continue;
}
this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
const std::vector<int>& gpuPriorityOrder = ev.GetGpuPriorityOrder();
this->InitComms(this->GetDeviceIdsList(numChildren, numGpus, ranksPerGpu, gpuPriorityOrder));
if (testing::Test::HasFailure())
{
isCorrect = false;
@@ -136,9 +136,12 @@ namespace RcclUnitTesting
// Helper function that splits up GPUs to the given number of processes
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
int const numGpus,
int const ranksPerGpu);
int const ranksPerGpu,
const std::vector<int>& gpuPriorityOrder);
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
int const numGpus);
int const numGpus,
const std::vector<int>& gpuPriorityOrder);
// Generate a test case name
static std::string GetTestCaseName(int const totalRanks,