GPU allocation for CPX Unit Tests using PCI bus id (#1403)
* Mapping devices with respect to their PCI bus IDs
* GPU allocation by using the PCI mapping
* Passing gpuPriorityOrder in as an argument rather than making the functions non-static.
* Removing redundant testBed instance calling
[ROCm/rccl commit: 69b2b712ab]
This commit is contained in:
gecommit door
GitHub
bovenliggende
ad1384bea1
commit
69d976532b
@@ -193,7 +193,8 @@ namespace RcclUnitTesting
|
||||
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
|
||||
{
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder));
|
||||
|
||||
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
|
||||
{
|
||||
|
||||
@@ -73,7 +73,8 @@ namespace RcclUnitTesting
|
||||
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
|
||||
{
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder));
|
||||
|
||||
// Prepare AllToAllV options
|
||||
std::vector<size_t> numInputElements;
|
||||
@@ -130,7 +131,8 @@ namespace RcclUnitTesting
|
||||
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
|
||||
{
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder));
|
||||
|
||||
// Prepare AllToAllV options
|
||||
std::vector<size_t> numInputElements;
|
||||
|
||||
@@ -28,7 +28,8 @@ namespace RcclUnitTesting
|
||||
{
|
||||
// Test either single process all GPUs, or 1 process per GPU
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup);
|
||||
|
||||
if (testBed.ev.showNames)
|
||||
INFO("%s %d-ranks GroupCall Identical\n", isMultiProcess ? "MP" : "SP", totalRanks);
|
||||
@@ -84,7 +85,8 @@ namespace RcclUnitTesting
|
||||
{
|
||||
// Test either single process all GPUs, or 1 process per GPU
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup);
|
||||
|
||||
if (testBed.ev.showNames)
|
||||
INFO("%s %d-ranks GroupCall Different\n", isMultiProcess ? "MP" : "SP", totalRanks);
|
||||
@@ -139,7 +141,8 @@ namespace RcclUnitTesting
|
||||
{
|
||||
// Test either single process all GPUs, or 1 process per GPU
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup);
|
||||
|
||||
if (testBed.ev.showNames)
|
||||
INFO("%s %d-ranks GroupCall MixedDayaType\n", isMultiProcess ? "MP" : "SP", totalRanks);
|
||||
@@ -194,7 +197,8 @@ namespace RcclUnitTesting
|
||||
INFO("%s %d-ranks Multistream %d-Group Calls across %d streams\n",
|
||||
isMultiProcess ? "MP" : "SP", totalRanks, numCollPerGroup, numStreamsPerGroup);
|
||||
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks),
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder),
|
||||
numCollPerGroup, numStreamsPerGroup);
|
||||
|
||||
// Set up each collective in group in different stream (modulo numStreamsPerGroup)
|
||||
@@ -244,7 +248,8 @@ namespace RcclUnitTesting
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
|
||||
// Initialize comms by specifying the # of group calls
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking);
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking);
|
||||
|
||||
if (testBed.ev.showNames)
|
||||
INFO("%s %d-ranks GroupCall MultiGroupCall\n", isMultiProcess ? "MP" : "SP", totalRanks);
|
||||
|
||||
@@ -34,7 +34,8 @@ namespace RcclUnitTesting
|
||||
{
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
// Initialize communicators in non-blocking mode
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1, 1, 1, useBlocking);
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), 1, 1, 1, useBlocking);
|
||||
|
||||
// Loop over various collective functions
|
||||
for (auto funcType : funcTypes)
|
||||
|
||||
@@ -27,7 +27,8 @@ namespace RcclUnitTesting
|
||||
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
|
||||
int totalRanks = numGpus * ranksPerGpu;
|
||||
int const numProcesses = isMultiProcess ? numGpus : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu, gpuPriorityOrder),
|
||||
{1,2}, //two group, second group sendrecv to self, has 2 coll
|
||||
testBed.GetNumStreamsPerGroup(1,2),
|
||||
2);
|
||||
@@ -119,7 +120,8 @@ namespace RcclUnitTesting
|
||||
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
|
||||
int totalRanks = numGpus * ranksPerGpu;
|
||||
int const numProcesses = isMultiProcess ? numGpus : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
|
||||
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu, gpuPriorityOrder),
|
||||
{1,2}, //two group, second group sendrecv to self, has 2 coll
|
||||
testBed.GetNumStreamsPerGroup(1,2),
|
||||
2);
|
||||
|
||||
@@ -123,6 +123,73 @@ namespace RcclUnitTesting
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Converts a PCI bus id string (e.g. "0000:0c:00.0") into a 64-bit integer.
 *
 * The ':' separators are skipped and parsing stops at the first '.', so the
 * function number is dropped and every function of the same physical PCI
 * device maps to the same id. Parsing also stops at the first character that
 * is not a hex digit (including the terminating NUL) or once 16 hex digits
 * have been collected.
 *
 * @param busId  NUL-terminated PCI bus id string.
 * @param id     Receives the parsed value; 0 when no hex digit was found.
 * @return ncclSuccess on success, ncclInvalidArgument if either pointer is null.
 */
ncclResult_t busIdToInt64(const char* busId, int64_t* id) {
  if (busId == NULL || id == NULL) return ncclInvalidArgument;

  char hexStr[17]; // Longest possible int64 hex string + null terminator.
  size_t hexOffset = 0;
  for (size_t i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
    char const c = busId[i];
    if (c == ':') continue;
    if (c == '.') break; // ignore everything after '.', it belongs to the same physical pci device
    bool const isHexDigit = (c >= '0' && c <= '9') ||
                            (c >= 'A' && c <= 'F') ||
                            (c >= 'a' && c <= 'f');
    if (!isHexDigit) break; // also terminates on the string's NUL
    hexStr[hexOffset++] = c;
  }
  hexStr[hexOffset] = '\0';

  // strtol() returns long, which is only 32 bits wide on some ABIs and clamps
  // at LONG_MAX; parse as unsigned 64-bit so all 16 hex digits are preserved.
  *id = static_cast<int64_t>(strtoull(hexStr, NULL, 16));
  return ncclSuccess;
}
|
||||
|
||||
int getDevicePriority (std::vector<int> *gpuPriorityOrder){
|
||||
// Prepare parent->child pipe
|
||||
int pipefd[2];
|
||||
if (pipe(pipefd) == -1) {
|
||||
ERROR("Unable to create parent->child pipe for getting the device priority vector.\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
pid_t pid = fork();
|
||||
if (0 == pid) {
|
||||
std::vector<int> result;
|
||||
try {
|
||||
int numDev;
|
||||
hipGetDeviceCount(&numDev);
|
||||
std::unordered_map<int64_t, std::vector<int>> uniqueIdToGpuIndexes;
|
||||
for(int dev=0;dev<numDev;dev++){
|
||||
char busIdStr[] = "00000000:00:00.0";
|
||||
int64_t busId;
|
||||
hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), dev);
|
||||
busIdToInt64(busIdStr, &busId);
|
||||
uniqueIdToGpuIndexes[busId].push_back(dev);
|
||||
}
|
||||
std::vector<std::pair<int64_t, std::vector<int>>> sortedIds(uniqueIdToGpuIndexes.begin(), uniqueIdToGpuIndexes.end());
|
||||
std::sort(sortedIds.begin(), sortedIds.end(), [](const auto& a, const auto& b) {
|
||||
return a.second.size() > b.second.size();
|
||||
});
|
||||
for (const auto& pair : sortedIds) {
|
||||
result.insert(result.end(), pair.second.begin(), pair.second.end());
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
if (write(pipefd[1], result.data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL;
|
||||
close(pipefd[0]);
|
||||
close(pipefd[1]);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
else {
|
||||
int status;
|
||||
if (read(pipefd[0], gpuPriorityOrder->data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL;
|
||||
waitpid(pid, &status, 0);
|
||||
assert(!status);
|
||||
close(pipefd[0]);
|
||||
close(pipefd[1]);
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
EnvVars::EnvVars()
|
||||
{
|
||||
@@ -151,10 +218,18 @@ namespace RcclUnitTesting
|
||||
// Total number of reduction ops
|
||||
int numOps = ncclNumOps;
|
||||
|
||||
gpuPriorityOrder.resize(numDetectedGpus);
|
||||
for(int i=0;i<numDetectedGpus;i++){
|
||||
gpuPriorityOrder[i] = i;
|
||||
}
|
||||
bool isCpxMode = false;
|
||||
if(isGfx94) {
|
||||
getDeviceMode(&isCpxMode);
|
||||
if(isCpxMode) {
|
||||
getDevicePriority(&gpuPriorityOrder);
|
||||
}
|
||||
}
|
||||
|
||||
// Test only pow2 number of GPUs for cpx mode to reduce the runtime for UT
|
||||
onlyPow2Gpus = GetEnvVar("UT_POW2_GPUS" , isCpxMode); // Default value set based on whether system is in CPX mode. UT_POW2_GPUS set by user overrides it.
|
||||
|
||||
@@ -235,6 +310,11 @@ namespace RcclUnitTesting
|
||||
return numGpusList;
|
||||
}
|
||||
|
||||
std::vector<int> const& EnvVars::GetGpuPriorityOrder()
|
||||
{
|
||||
return gpuPriorityOrder;
|
||||
}
|
||||
|
||||
std::vector<int> const& EnvVars::GetIsMultiProcessList()
|
||||
{
|
||||
return isMultiProcessList;
|
||||
|
||||
@@ -41,6 +41,7 @@ namespace RcclUnitTesting
|
||||
|
||||
std::vector<int> const& GetNumGpusList();
|
||||
std::vector<int> const& GetIsMultiProcessList();
|
||||
std::vector<int> const& GetGpuPriorityOrder(); // Orders the gpus based on the associativity of them with OAM with higher gpus linked.
|
||||
void ShowConfig();
|
||||
|
||||
protected:
|
||||
@@ -49,6 +50,7 @@ namespace RcclUnitTesting
|
||||
std::vector<int> numGpusList; // List of # Gpus to use [UT_MIN_GPUS/UT_MAX_GPUS/UT_POW2_GPUS]
|
||||
std::vector<int> isMultiProcessList; // Single or multi process [UT_PROCESS_MASK]
|
||||
int numDetectedGpus;
|
||||
std::vector<int> gpuPriorityOrder; // Orders the gpus based on the associativity of them with OAM with higher gpus linked.
|
||||
|
||||
// Helper functions to parse environment variables
|
||||
int GetEnvVar(std::string const varname, int defaultValue);
|
||||
|
||||
@@ -193,7 +193,8 @@ namespace RcclUnitTesting
|
||||
|
||||
void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup, int const numStreamsPerGroup, int const numGroupCalls, bool const useBlocking)
|
||||
{
|
||||
InitComms(TestBed::GetDeviceIdsList(1, numGpus), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking);
|
||||
const std::vector<int>& gpuPriorityOrder = ev.GetGpuPriorityOrder();
|
||||
InitComms(GetDeviceIdsList(1, numGpus, gpuPriorityOrder), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking);
|
||||
}
|
||||
|
||||
void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,
|
||||
@@ -562,21 +563,23 @@ namespace RcclUnitTesting
|
||||
}
|
||||
|
||||
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus)
|
||||
int const numGpus,
|
||||
const std::vector<int>& gpuPriorityOrder)
|
||||
{
|
||||
return GetDeviceIdsList(numProcesses, numGpus, 1);
|
||||
return GetDeviceIdsList(numProcesses, numGpus, 1, gpuPriorityOrder);
|
||||
}
|
||||
|
||||
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus,
|
||||
int const ranksPerGpu)
|
||||
int const ranksPerGpu,
|
||||
const std::vector<int>& gpuPriorityOrder)
|
||||
{
|
||||
std::vector<std::vector<int>> result(numProcesses);
|
||||
int ntasks = numProcesses == 1 ? numGpus : 1;
|
||||
int k=0;
|
||||
for (int i = 0; i < numProcesses; i++)
|
||||
for (int j = 0; j < ntasks * ranksPerGpu; j++) {
|
||||
result[i].push_back(k%numGpus);
|
||||
result[i].push_back(gpuPriorityOrder[k%numGpus]);
|
||||
k++;
|
||||
}
|
||||
return result;
|
||||
@@ -668,7 +671,8 @@ namespace RcclUnitTesting
|
||||
if(enableSweep == false && (numGpus < 8 || numRanks < 8)) {
|
||||
continue;
|
||||
}
|
||||
this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
|
||||
const std::vector<int>& gpuPriorityOrder = ev.GetGpuPriorityOrder();
|
||||
this->InitComms(this->GetDeviceIdsList(numChildren, numGpus, ranksPerGpu, gpuPriorityOrder));
|
||||
if (testing::Test::HasFailure())
|
||||
{
|
||||
isCorrect = false;
|
||||
|
||||
@@ -136,9 +136,12 @@ namespace RcclUnitTesting
|
||||
// Helper function that splits up GPUs to the given number of processes
|
||||
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus,
|
||||
int const ranksPerGpu);
|
||||
int const ranksPerGpu,
|
||||
const std::vector<int>& gpuPriorityOrder);
|
||||
|
||||
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus);
|
||||
int const numGpus,
|
||||
const std::vector<int>& gpuPriorityOrder);
|
||||
|
||||
// Generate a test case name
|
||||
static std::string GetTestCaseName(int const totalRanks,
|
||||
|
||||
Verwijs in nieuw issue
Block a user