Virtual device enablement ( Minimal changes ) (#2110)

* minimal changes

* Setting Default tuning table

* Add warnings NIC merge accross PCIe Root complexes,NUMA

---------

Co-authored-by: Corey Derochie <161367113+corey-derochie-amd@users.noreply.github.com>

[ROCm/rccl commit: 6f62165369]
This commit is contained in:
Avinash
2025-12-25 15:06:33 -06:00
committato da GitHub
parent f221a1ae08
commit 2585ae8815
5 ha cambiato i file con 155 aggiunte e 5 eliminazioni
+1 -2
Vedi File
@@ -1462,9 +1462,8 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name, comm->dmaBufSupport), ret, fail);
}
NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail);
// [RCCL] Disabled virtual devices
NCCLCHECKGOTO(ncclTopoProcessNet(xml, 0, dumpXmlFile, state,
comm->ncclNet->getProperties, nullptr /*comm->ncclNet->makeVDevice*/, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail);
comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail);
pthread_mutex_unlock(&netLock);
netLockHeld = 0;
+21
Vedi File
@@ -915,3 +915,24 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm,
*time = lat * latCount + nBytes / (1000 * bw);
return ncclSuccess;
}
/**
* takes gfx arch name as C-style string and returns a tuning index to
*/
int rcclGetTuningIndexForArch(const char* gfxarch) {
static const std::vector<std::pair<std::string, int>> tuningIndexMap = {
{"gfx906", 0}, {"gfx908", 0}, {"gfx90a", 0}, {"gfx942", 5},
{"gfx950", 6}, {"gfx1030", 0}, {"gfx1100", 0}, {"gfx1102", 0},
{"gfx1200", 7}, {"gfx1201", 7}
};
if (gfxarch == nullptr) return 0;
std::string arch(gfxarch);
for (const auto& p : tuningIndexMap) {
const std::string& prefix = p.first;
if (arch.size() >= prefix.size() &&
arch.compare(0, prefix.size(), prefix) == 0) {
return p.second;
}
}
return 0;
}
+1 -1
Vedi File
@@ -144,5 +144,5 @@ ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* tr
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time);
int rcclGetTuningIndexForArch(const char* gfxarch);
#endif
+2 -1
Vedi File
@@ -1246,9 +1246,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
if (dumpXmlFile) {
NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
}
// Topo detection / System graph creation
NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
comm->topo->tuning = rcclGetTuningIndexForArch(comm->archName);
INFO(NCCL_INIT, "Tuning index set to: %d", comm->topo->tuning);
// save nRanks to ncclTopoSystem as indicator of multi-node
comm->topo->nRanks = comm->nRanks;
// init netGdrLevel
+130 -1
Vedi File
@@ -559,9 +559,113 @@ failure:
return false;
}
/**
* Extract the PCIe root complex (domain:bus) from a Linux sysfs PCI device path.
*
* This function scans a sysfs PCI device path (e.g.
* "/sys/devices/pci0000:80/0000:80:03.1/0000:81:00.0")
* and extracts the first occurrence of the PCI root host bridge identifier
* of the form "pciDDDD:BB", where:
* - DDDD is the PCI domain (hex)
* - BB is the PCI bus number (hex)
*
* The extracted root is returned as a string formatted as "DDDD:BB".
*
* This helper is typically used to determine whether multiple NICs or devices
* reside under the same PCIe root complex, which is important for validating
* NIC merging and avoiding cross-root PCIe traffic.
*
* @param pciPath NUL-terminated sysfs PCI device path
* @param root Output buffer receiving the "DDDD:BB" PCI root identifier
* @param rootLen Size of the output buffer; must be >= 8
*
* @return ncclSuccess on success
* @return ncclInvalidUsage if inputs are invalid or no PCI root is found
*
* Notes:
* - The function scans for the first valid "pciDDDD:BB" pattern in the path.
* - The loop is bounded by the length of the string and cannot hang.
* - This function does not allocate memory.
*/
static ncclResult_t ncclIbGetPciRootFromPath(
const char* pciPath,
char* root,
size_t rootLen
) {
if (pciPath == NULL || root == NULL || rootLen < 8){
return ncclInvalidUsage;
}
const char* p = strstr(pciPath, "pci");
while (p != NULL) {
int domain, bus;
int chars_read = 0;
if (sscanf(p, "pci%4x:%2x%n", &domain, &bus, &chars_read) == 2 &&
chars_read == 10) {
snprintf(root, rootLen, "%04x:%02x", domain, bus);
return ncclSuccess;
}
p = strstr(p + 1, "pci");
}
return ncclInvalidUsage;
}
/**
* Determine the NUMA node associated with a PCI device from its sysfs path.
*
* This function reads the "numa_node" attribute from a PCI device's sysfs
* directory (e.g. "<pciPath>/numa_node") and returns the NUMA node ID.
*
* Linux sysfs convention:
* - A non-negative integer indicates the NUMA node the device is local to
* - "-1" indicates that the device has no specific NUMA affinity
*
* This helper is used to validate NUMA locality when merging NICs, ensuring
* that merged devices do not silently span NUMA nodes, which could negatively
* impact performance.
*
* @param pciPath NUL-terminated sysfs PCI device path
*
* @return NUMA node ID (>= 0) on success
* @return -1 if the NUMA node cannot be determined, the file is missing,
* unreadable, or contains invalid data
*
* Notes:
* - Uses open/read instead of stdio to avoid buffering and locale issues.
* - Uses strtol for robust numeric parsing and overflow detection.
* - A return value of -1 may indicate either "no NUMA affinity" or an error;
* callers should treat it as "unknown or unspecified".
*/
static int ncclIbGetNumaNodeFromPath(const char* pciPath) {
if (pciPath == NULL) {
return -1;
}
char numaPath[PATH_MAX];
if (snprintf(numaPath, sizeof(numaPath), "%s/numa_node", pciPath) >= PATH_MAX) {
return -1;
}
int fd = open(numaPath, O_RDONLY);
if (fd < 0) return -1;
char buf[32];
ssize_t n = read(fd, buf, sizeof(buf) - 1);
close(fd);
if (n <= 0) return -1;
buf[n] = '\0';
char* endptr;
errno = 0;
long numa = strtol(buf, &endptr, 10);
if (endptr == buf || errno == ERANGE) {
return -1;
}
return (int)numa;
}
ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) {
if (ncclParamIbMergeNics() == 0 && props->ndevs > 1) {
INFO(NCCL_NET, "NET/IB : Skipping makeVDevice, NCCL_IB_MERGE_NICS=0");
WARN("NET/IB : Skipping makeVDevice, Please set NCCL_IB_MERGE_NICS=1");
return ncclInvalidUsage;
}
@@ -609,6 +713,31 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) {
}
}
int numa0 = ncclIbGetNumaNodeFromPath(dev0->pciPath);
//format -> 0000:00
char root0[8];
ncclIbGetPciRootFromPath(dev0->pciPath, root0, sizeof(root0));
for (int i = 1; i < props->ndevs; i++) {
ncclIbDev* dev = ncclIbDevs + props->devs[i];
int numa_i = ncclIbGetNumaNodeFromPath(dev->pciPath);
if (numa0 >= 0 && numa_i >= 0 && numa_i != numa0) {
WARN("NET/IB : Merging NICs across NUMA nodes (%s numa=%d, %s numa=%d). "
"This may significantly reduce performance.",
dev0->devName, numa0, dev->devName, numa_i);
break;
}
char root_i[8];
ncclIbGetPciRootFromPath(dev->pciPath, root_i, sizeof(root_i));
if (strcmp(root_i, root0) != 0) {
WARN("NET/IB : Merging NICs across PCIe Root Complexes "
"(%s root=%s, %s root=%s). "
"GPUDirect RDMA and bandwidth aggregation may be impacted.",
dev0->devName, root0, dev->devName, root_i);
break;
}
}
*d = ncclNMergedIbDevs++;
INFO(NCCL_NET, "NET/IB : Made virtual device [%d] name=%s speed=%d ndevs=%d", *d, mDev->devName, mDev->speed, mDev->vProps.ndevs);
return ncclSuccess;