From 6f62165369459e32fee4d04bcd3761ef160db71f Mon Sep 17 00:00:00 2001 From: Avinash <44542533+PJAvinash@users.noreply.github.com> Date: Thu, 25 Dec 2025 15:06:33 -0600 Subject: [PATCH] Virtual device enablement ( Minimal changes ) (#2110) * minimal changes * Setting Default tuning table * Add warnings NIC merge accross PCIe Root complexes,NUMA --------- Co-authored-by: Corey Derochie <161367113+corey-derochie-amd@users.noreply.github.com> --- src/graph/topo.cc | 3 +- src/graph/tuning.cc | 21 +++++++ src/include/graph.h | 2 +- src/init.cc | 3 +- src/transport/net_ib.cc | 131 +++++++++++++++++++++++++++++++++++++++- 5 files changed, 155 insertions(+), 5 deletions(-) diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 503cf1aeda..a53b5aecdb 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -1462,9 +1462,8 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name, comm->dmaBufSupport), ret, fail); } NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail); - // [RCCL] Disabled virtual devices NCCLCHECKGOTO(ncclTopoProcessNet(xml, 0, dumpXmlFile, state, - comm->ncclNet->getProperties, nullptr /*comm->ncclNet->makeVDevice*/, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail); + comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail); pthread_mutex_unlock(&netLock); netLockHeld = 0; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 1cdcf9edab..7ec4911d07 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -915,3 +915,24 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, *time = lat * latCount + nBytes / (1000 * bw); return ncclSuccess; } + +/** + * takes gfx arch name as C-style string and returns a tuning index to + */ +int rcclGetTuningIndexForArch(const char* gfxarch) { + static const std::vector> tuningIndexMap = { + {"gfx906", 0}, {"gfx908", 0}, {"gfx90a", 0}, {"gfx942", 5}, + {"gfx950", 6}, {"gfx1030", 0}, {"gfx1100", 0}, {"gfx1102", 0}, + {"gfx1200", 7}, {"gfx1201", 7} + }; + if (gfxarch == nullptr) return 0; + std::string arch(gfxarch); + for (const auto& p : tuningIndexMap) { + const std::string& prefix = p.first; + if (arch.size() >= prefix.size() && + arch.compare(0, prefix.size(), prefix) == 0) { + return p.second; + } + } + return 0; +} \ No newline at end of file diff --git a/src/include/graph.h b/src/include/graph.h index 7b5bb95378..130c754642 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -144,5 +144,5 @@ ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* tr ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time); - +int rcclGetTuningIndexForArch(const char* gfxarch); #endif diff --git a/src/init.cc b/src/init.cc index 86187bd35d..0fce2ddb42 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1246,9 +1246,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p if (dumpXmlFile) { NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail); } - // Topo detection / System graph creation NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail); + comm->topo->tuning = rcclGetTuningIndexForArch(comm->archName); + INFO(NCCL_INIT, "Tuning index set to: %d", comm->topo->tuning); // save nRanks to ncclTopoSystem as indicator of multi-node comm->topo->nRanks = comm->nRanks; // init netGdrLevel diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 7af56a6c95..ee83fffffa 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -559,9 +559,113 @@ failure: return false; } +/** + * Extract the PCIe root complex (domain:bus) from a Linux sysfs PCI device path. + * + * This function scans a sysfs PCI device path (e.g. + * "/sys/devices/pci0000:80/0000:80:03.1/0000:81:00.0") + * and extracts the first occurrence of the PCI root host bridge identifier + * of the form "pciDDDD:BB", where: + * - DDDD is the PCI domain (hex) + * - BB is the PCI bus number (hex) + * + * The extracted root is returned as a string formatted as "DDDD:BB". + * + * This helper is typically used to determine whether multiple NICs or devices + * reside under the same PCIe root complex, which is important for validating + * NIC merging and avoiding cross-root PCIe traffic. + * + * @param pciPath NUL-terminated sysfs PCI device path + * @param root Output buffer receiving the "DDDD:BB" PCI root identifier + * @param rootLen Size of the output buffer; must be >= 8 + * + * @return ncclSuccess on success + * @return ncclInvalidUsage if inputs are invalid or no PCI root is found + * + * Notes: + * - The function scans for the first valid "pciDDDD:BB" pattern in the path. + * - The loop is bounded by the length of the string and cannot hang. + * - This function does not allocate memory. + */ +static ncclResult_t ncclIbGetPciRootFromPath( + const char* pciPath, + char* root, + size_t rootLen +) { + if (pciPath == NULL || root == NULL || rootLen < 8){ + return ncclInvalidUsage; + } + const char* p = strstr(pciPath, "pci"); + while (p != NULL) { + int domain, bus; + int chars_read = 0; + if (sscanf(p, "pci%4x:%2x%n", &domain, &bus, &chars_read) == 2 && + chars_read == 10) { + snprintf(root, rootLen, "%04x:%02x", domain, bus); + return ncclSuccess; + } + p = strstr(p + 1, "pci"); + } + return ncclInvalidUsage; +} + +/** + * Determine the NUMA node associated with a PCI device from its sysfs path. + * + * This function reads the "numa_node" attribute from a PCI device's sysfs + * directory (e.g. "/numa_node") and returns the NUMA node ID. + * + * Linux sysfs convention: + * - A non-negative integer indicates the NUMA node the device is local to + * - "-1" indicates that the device has no specific NUMA affinity + * + * This helper is used to validate NUMA locality when merging NICs, ensuring + * that merged devices do not silently span NUMA nodes, which could negatively + * impact performance. + * + * @param pciPath NUL-terminated sysfs PCI device path + * + * @return NUMA node ID (>= 0) on success + * @return -1 if the NUMA node cannot be determined, the file is missing, + * unreadable, or contains invalid data + * + * Notes: + * - Uses open/read instead of stdio to avoid buffering and locale issues. + * - Uses strtol for robust numeric parsing and overflow detection. + * - A return value of -1 may indicate either "no NUMA affinity" or an error; + * callers should treat it as "unknown or unspecified". + */ +static int ncclIbGetNumaNodeFromPath(const char* pciPath) { + if (pciPath == NULL) { + return -1; + } + char numaPath[PATH_MAX]; + if (snprintf(numaPath, sizeof(numaPath), "%s/numa_node", pciPath) >= PATH_MAX) { + return -1; + } + + int fd = open(numaPath, O_RDONLY); + if (fd < 0) return -1; + + char buf[32]; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + + if (n <= 0) return -1; + buf[n] = '\0'; + + char* endptr; + errno = 0; + long numa = strtol(buf, &endptr, 10); + if (endptr == buf || errno == ERANGE) { + return -1; + } + return (int)numa; +} + ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { if (ncclParamIbMergeNics() == 0 && props->ndevs > 1) { - INFO(NCCL_NET, "NET/IB : Skipping makeVDevice, NCCL_IB_MERGE_NICS=0"); + WARN("NET/IB : Skipping makeVDevice, Please set NCCL_IB_MERGE_NICS=1"); return ncclInvalidUsage; } @@ -609,6 +713,31 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { } } + int numa0 = ncclIbGetNumaNodeFromPath(dev0->pciPath); + //format -> 0000:00 + char root0[8]; + ncclIbGetPciRootFromPath(dev0->pciPath, root0, sizeof(root0)); + for (int i = 1; i < props->ndevs; i++) { + ncclIbDev* dev = ncclIbDevs + props->devs[i]; + int numa_i = ncclIbGetNumaNodeFromPath(dev->pciPath); + if (numa0 >= 0 && numa_i >= 0 && numa_i != numa0) { + WARN("NET/IB : Merging NICs across NUMA nodes (%s numa=%d, %s numa=%d). " + "This may significantly reduce performance.", + dev0->devName, numa0, dev->devName, numa_i); + break; + } + + char root_i[8]; + ncclIbGetPciRootFromPath(dev->pciPath, root_i, sizeof(root_i)); + if (strcmp(root_i, root0) != 0) { + WARN("NET/IB : Merging NICs across PCIe Root Complexes " + "(%s root=%s, %s root=%s). " + "GPUDirect RDMA and bandwidth aggregation may be impacted.", + dev0->devName, root0, dev->devName, root_i); + break; + } + } + *d = ncclNMergedIbDevs++; INFO(NCCL_NET, "NET/IB : Made virtual device [%d] name=%s speed=%d ndevs=%d", *d, mDev->devName, mDev->speed, mDev->vProps.ndevs); return ncclSuccess;