Files
rocm-systems/src/graph/topo.cc
T
Sylvain Jeaugey ab2b89c4c3 2.21.5-1
Add support for IB SHARP 1PPN operation with user buffers.
Improve support for MNNVL, add NVLS support and multi-clique support.
 * Detect the NVLS clique through NVML
 * Exchange XML between peers in the same NVLS clique and fuse XMLs
   before creating the topology graph.
 * Rework bootstrap allgather algorithms to allow for large allgather
   operations intra-node (XML exchange).
Net/IB: add support for dynamic GID detection.
 * Automatically select RoCEv2/IPv4 interface by default. Allow to
   select IPv6 or even the network/mask.
Reduce NVLS memory usage.
 * Add stepSize as property of a connection to allow for different
   sizes on different peers; set it to 128K for NVLink SHARP.
Improve tuner loading
 * Look for more paths, be more consistent with the network device
   plugin.
 * Also search for tuner support inside the net plugin.
Improve tuner API
 * Add context to support multi-device per process.
Add magic number around comm object to detect comm corruption.
 * Add some basic check around communicators so that we can report a
   problem when a communicator gets corrupted or a wrong comm pointer
   is passed to NCCL.
Fix net/IB error path. Github PR #1164
Fix collnet rail mapping with split comm.
Fix packet reordering issue causing bootstrap mismatch
 * Use a different tag in ncclTransportP2pSetup for the connectInfo
   exchange and the following barrier.
Fix hang when crossNic is inconsistent between ranks.
Fix minCompCap/maxCompCap computation. Github issue #1184
2024-04-02 01:53:21 -07:00

968 wiersze
38 KiB
C++

/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "nvmlwrap.h"
#include "net.h"
#include "coll_net.h"
#include <sys/stat.h>
#include <fcntl.h>
#include "xml.h"
#include "cpuset.h"
#include "bootstrap.h"
// Sizes of a full PCI bus id string of the form "0000:00:00.0" and of its
// "0000:00" prefix. sizeof is applied to string literals, so both values
// include the terminating NUL byte.
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
// Printable names of node types; indexed with node->type when the topology is printed.
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
// Printable names of link types; indexed with link->type when the topology is printed.
// The empty strings are entries for which this table provides no name.
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" };
// Printable names of path types.
// NOTE(review): presumably indexed with the PATH_* constants; not used in the visible part of this file.
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" };
/******************************************************************/
/******************* Graph Creation Functions *********************/
/******************************************************************/
// Convert the last component of a PCI path into a 64-bit id.
// For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
//   path      : PCI path; the component of interest ends at path+offset.
//   offset    : position (from path) of the last character to consider.
//   minOffset : NOTE(review): accepted but never read, so the backward scan
//               below relies on a '/' being present before path+offset.
//   id        : receives the id with its low 4 bits cleared.
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
  char* cursor = path + offset;
  // Step over a trailing "/"
  if (*cursor == '/') cursor--;
  // Walk back to the "/" preceding the last component
  for (; *cursor != '/'; cursor--) {}
  int64_t busId;
  NCCLCHECK(busIdToInt64(cursor + 1, &busId));
  // Ignore subdevice because those should use the same PCI link so we want to merge nodes.
  *id = busId - (busId & 0xf);
  return ncclSuccess;
}
// Depth-first search for a CPU node following LINK_PCI links only.
//   node : node to search from.
//   from : node we arrived from (NULL at the root). It is never re-entered:
//          PCI links are registered in both directions, so stepping back to
//          the previous node would bounce between the two nodes forever.
//   cpu  : set to the first CPU node found, or NULL if none is reachable.
static ncclResult_t findLocalCpuFrom(struct ncclTopoNode* node, struct ncclTopoNode* from, struct ncclTopoNode** cpu) {
  *cpu = NULL;
  if (node->type == CPU) {
    *cpu = node;
    return ncclSuccess;
  }
  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoNode* next = node->links[l].remNode;
    if (node->links[l].type != LINK_PCI || next == from) continue;
    NCCLCHECK(findLocalCpuFrom(next, node, cpu));
    if (*cpu != NULL) return ncclSuccess;
  }
  return ncclSuccess;
}
// Find the CPU node reachable from `node` through PCI links.
// *cpu is set to that CPU node (or to `node` itself if it is a CPU), or to
// NULL when no CPU can be reached. Always returns ncclSuccess unless a
// recursive step reports an error.
static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) {
  return findLocalCpuFrom(node, NULL, cpu);
}
// File-scope integer globals, both initialized to 0.
// NOTE(review): neither is referenced in the visible part of this file — confirm whether they are still used elsewhere.
int interCpuBw = 0;
int cpuPciBw = 0;
// Select the bandwidth used for links between CPU nodes, based on the
// architecture, vendor and model recorded on `cpu`.
//   cpu : CPU node whose cpu.arch / cpu.vendor / cpu.model are inspected.
//   bw  : receives the bandwidth; LOC_BW when no specific rule matches.
// Always returns ncclSuccess.
static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
  const int arch = cpu->cpu.arch;
  if (arch == NCCL_TOPO_CPU_ARCH_POWER) {
    *bw = P9_BW;
    return ncclSuccess;
  }
  if (arch == NCCL_TOPO_CPU_ARCH_ARM) {
    *bw = ARM_BW;
    return ncclSuccess;
  }
  // Default, kept for non-x86 architectures and unrecognized x86 vendors.
  *bw = LOC_BW;
  if (arch != NCCL_TOPO_CPU_ARCH_X86) return ncclSuccess;

  // x86: the value depends on the vendor, and for some vendors on the model.
  if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
    if (cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL) *bw = SKL_QPI_BW;
    else *bw = QPI_BW;
  }
  if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
    *bw = AMD_BW;
  }
  if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
    if (cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG) *bw = YONGFENG_ZPI_BW;
    else *bw = ZPI_BW;
  }
  return ncclSuccess;
}
// Kinds of device an NVLink can be attached to.
// NOTE(review): this enum is not referenced in the visible part of this file.
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceUnknown,
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
// Look up the node of the given type whose id equals `id`.
// On a match *node points to it; when nothing matches *node is left
// untouched, so callers must initialize it (typically to NULL) beforehand.
// Always returns ncclSuccess.
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
  struct ncclTopoNode* cur = system->nodes[type].nodes;
  struct ncclTopoNode* const end = cur + system->nodes[type].count;
  for (; cur < end; cur++) {
    if (cur->id == id) {
      *node = cur;
      break;
    }
  }
  return ncclSuccess;
}
// Append a node of the given type and id to the system and initialize its
// type-specific fields.
//   node : receives a pointer to the new node.
// Returns ncclInternalError when NCCL_TOPO_MAX_NODES nodes of that type
// already exist, ncclSuccess otherwise.
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
  const int slot = system->nodes[type].count;
  if (slot == NCCL_TOPO_MAX_NODES) {
    WARN("Error : tried to create too many nodes of type %d", type);
    return ncclInternalError;
  }
  struct ncclTopoNode* created = system->nodes[type].nodes + slot;
  system->nodes[type].count = slot + 1;
  created->type = type;
  created->id = id;

  if (type == NET) {
    created->net.asic = 0ULL;
    created->net.port = NCCL_TOPO_UNDEF;
    created->net.bw = 0.0;
    created->net.latency = 0.0;
  } else if (type == CPU) {
    created->cpu.arch = NCCL_TOPO_UNDEF;
    created->cpu.vendor = NCCL_TOPO_UNDEF;
    created->cpu.model = NCCL_TOPO_UNDEF;
  } else if (type == GPU) {
    // Create link to itself (used in some corner cases)
    struct ncclTopoLink* self = created->links;
    created->nlinks = 1;
    self->type = LINK_LOC;
    self->remNode = created;
    self->bw = LOC_BW;
    created->gpu.dev = NCCL_TOPO_UNDEF;
    created->gpu.rank = NCCL_TOPO_UNDEF;
    created->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
  }
  *node = created;
  return ncclSuccess;
}
// Remove the node at `index` from the array of nodes of the given type.
// - Frees the path arrays owned by the removed node.
// - Drops every link in the system that targets the removed node.
// - Shifts down by one every link pointer that targets a node stored after
//   the removed one in the same array, then compacts that array.
// Vacated link and node slots are zeroed: other code in this file finds the
// end of a link array by scanning for remNode == NULL, and would otherwise
// see stale copies left behind by the compaction.
// Always returns ncclSuccess.
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) {
  struct ncclTopoNode* delNode = system->nodes[type].nodes+index;
  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
    free(delNode->paths[t]);
    for (int n=0; n<system->nodes[t].count; n++) {
      struct ncclTopoNode* node = system->nodes[t].nodes+n;
      if (node == delNode) continue;
      for (int l=0; l<node->nlinks; l++) {
        // Several consecutive links may target the removed node.
        while (l<node->nlinks && node->links[l].remNode == delNode) {
          memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
          node->nlinks--;
          // Clear the slot freed at the end so that links[nlinks].remNode is NULL again.
          memset(node->links+node->nlinks, 0, sizeof(struct ncclTopoLink));
        }
        // Nodes located after delNode are about to move down by one element.
        if (l<node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
          node->links[l].remNode--;
        }
      }
    }
  }
  memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode));
  system->nodes[type].count--;
  // Clear the slot freed at the end of the node array so a later node creation
  // does not inherit stale links or already-freed path pointers.
  memset(system->nodes[type].nodes+system->nodes[type].count, 0, sizeof(struct ncclTopoNode));
  return ncclSuccess;
}
// Add a one-way link of the given type and bandwidth from `node` to `remNode`.
// If a link with the same target and type already exists, its bandwidth is
// increased by `bw` instead (this is how multiple NVLinks get aggregated).
// The link array is kept ordered by decreasing bandwidth.
// Returns ncclInternalError when all NCCL_TOPO_MAX_LINKS slots are used by
// other links, ncclSuccess otherwise.
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) {
  // Find the matching link, or failing that the first unused slot (remNode == NULL).
  int slot = 0;
  while (slot != NCCL_TOPO_MAX_LINKS && node->links[slot].remNode) {
    if (node->links[slot].remNode == remNode && node->links[slot].type == type) break;
    slot++;
  }
  if (slot == NCCL_TOPO_MAX_LINKS) {
    WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
    return ncclInternalError;
  }
  struct ncclTopoLink* target = node->links + slot;
  if (target->remNode == NULL) node->nlinks++;
  target->type = type;
  target->remNode = remNode;
  target->bw += bw;

  // Move the updated link towards the front until its predecessor has at
  // least as much bandwidth.
  struct ncclTopoLink updated;
  memcpy(&updated, target, sizeof(struct ncclTopoLink));
  for (; slot > 0; slot--) {
    if (node->links[slot-1].bw >= updated.bw) break;
    memcpy(node->links+slot, node->links+slot-1, sizeof(struct ncclTopoLink));
  }
  memcpy(node->links+slot, &updated, sizeof(struct ncclTopoLink));
  return ncclSuccess;
}
// BCM Gen4 Switches present themselves as a two-level hierarchical switch
// even though they're supposed to sustain full BW across all ports.
// Flatten the switch as this extra level can break the search and make
// NCCL take wrong topology decisions.
//
// Classify a PCI device id, ignoring its low 12 bits:
//   returns 4 when the remaining bits equal 0x1000c0101000a000,
//   returns 5 when they equal 0x1000c03010000000 with level*0x1000 OR-ed in,
//   returns 0 otherwise.
int getBcmGen(uint64_t id, int level) {
  const uint64_t masked = id & 0xfffffffffffff000;
  int gen = 0;
  if (masked == 0x1000c0101000a000) {
    gen = 4;
  } else if (masked == (0x1000c03010000000 | level*0x1000)) {
    gen = 5;
  }
  return gen;
}
// Fuse the sub switches whose ids are listed in subSwIds[0..subs-1] into
// pciSwitch: every device attached to a sub switch is re-attached to
// pciSwitch, then the sub switch node is removed from the system.
// Returns the first error reported by a callee, or ncclInternalError when
// pciSwitch runs out of link slots.
static ncclResult_t ncclTopoFuseBcmSubSwitches(struct ncclTopoSystem* system, struct ncclTopoNode* pciSwitch, const int64_t* subSwIds, int subs) {
  for (int i=0; i<subs; i++) {
    // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
    int index;
    NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[i], &index));
    struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
    // Connect all sub PCI devices to the parent switch
    for (int l=0; l<sub->nlinks; l++) {
      struct ncclTopoNode* remNode = sub->links[l].remNode;
      if (remNode == pciSwitch) continue;
      // Add link from parent PCI switch -> PCI device
      if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) {
        WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
        return ncclInternalError;
      }
      memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
      pciSwitch->nlinks++;
      // Update link from PCI device -> parent PCI switch
      for (int rl=0; rl<remNode->nlinks; rl++) {
        if (remNode->links[rl].remNode == sub) {
          remNode->links[rl].remNode = pciSwitch;
          break;
        }
      }
    }
    NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
  }
  return ncclSuccess;
}
// Flatten two-level BCM PCI switches: for every PCI node recognized by
// getBcmGen(device, 0), the directly attached PCI nodes that getBcmGen(device, 1)
// reports as the same generation are fused into it.
// The temporary id list is released on every path, including errors.
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
  for (int s=0; s<system->nodes[PCI].count; s++) {
    struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
    int gen = getBcmGen(pciSwitch->pci.device, 0);
    // Flatten Gen4 PEX switches in base mode
    if (gen == 0) continue;

    // Find sub switches with the same device ID.
    int64_t* subSwIds;
    NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
    int subs = 0;
    for (int l=0; l<pciSwitch->nlinks; l++) {
      struct ncclTopoNode* sub = pciSwitch->links[l].remNode;
      // Only fuse sub switches with the same device ID.
      if (sub->type != PCI || getBcmGen(sub->pci.device, 1) != gen) continue;
      // Save sub switch for later
      subSwIds[subs++] = sub->id;
      // Remove link to that sub switch
      memmove(pciSwitch->links+l, pciSwitch->links+l+1, (pciSwitch->nlinks-l-1)*(sizeof(struct ncclTopoLink)));
      pciSwitch->nlinks--;
      // Clear the slot freed at the end: the end of a link array is also
      // detected by scanning for remNode == NULL elsewhere in this file.
      memset(pciSwitch->links+pciSwitch->nlinks, 0, sizeof(struct ncclTopoLink));
      // Don't increase l for the next iteration as we just shifted all links by one.
      l--;
    }

    ncclResult_t fuseResult = ncclTopoFuseBcmSubSwitches(system, pciSwitch, subSwIds, subs);
    // Release the id list before propagating any error.
    free(subSwIds);
    NCCLCHECK(fuseResult);

    // Set subdevice to 0xffff to make sure we don't merge this switch again.
    pciSwitch->pci.device |= 0xffff;
    // Restart, as system->nodes[PCI].nodes has changed.
    s = 0;
  }
  return ncclSuccess;
}
// Add a LINK_SYS link from every CPU node to every other CPU node that has
// the same system id. The bandwidth of each link is chosen from the CPU at
// the source end.
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
  struct ncclTopoNode* cpus = system->nodes[CPU].nodes;
  for (int src=0; src<system->nodes[CPU].count; src++) {
    for (int dst=0; dst<system->nodes[CPU].count; dst++) {
      if (src == dst) continue;
      // CPUs belonging to different systems are not linked here.
      if (NCCL_TOPO_ID_SYSTEM_ID(cpus[src].id) != NCCL_TOPO_ID_SYSTEM_ID(cpus[dst].id)) continue;
      float bw;
      NCCLCHECK(ncclTopoGetInterCpuBw(cpus+src, &bw));
      NCCLCHECK(ncclTopoConnectNodes(cpus+src, cpus+dst, LINK_SYS, bw));
    }
  }
  return ncclSuccess;
}
// Log `node` and, recursively, everything hanging below it through PCI links.
//   node     : node to print.
//   prevNode : node we came from; the PCI link back to it is not printed.
//   line     : scratch buffer shared by the whole recursion. It is written with
//              sprintf, so nothing here checks that the text fits.
//   offset   : position in `line` where this node's description starts.
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
// Describe the node itself; the details in parentheses depend on the node type.
if (node->type == GPU) {
sprintf(line+offset, "%s/%lx-%lx (%d)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->gpu.rank);
} else if (node->type == CPU) {
sprintf(line+offset, "%s/%lx-%lx (%d/%d/%d)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else if (node->type == PCI) {
sprintf(line+offset, "%s/%lx-%lx (%lx)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->pci.device);
} else {
sprintf(line+offset, "%s/%lx-%lx", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
}
INFO(NCCL_GRAPH, "%s", line);
// Blank out the prefix so the lines printed for the links below are indented.
for (int i=0; i<offset; i++) line[i] = ' ';
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
// Self links are not shown.
if (link->type == LINK_LOC) continue;
// Skip only the PCI link leading back to the node we came from.
if (link->type != LINK_PCI || link->remNode != prevNode) {
sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
int nextOffset = strlen(line);
if (link->type == LINK_PCI) {
// PCI links form the tree: print the remote node and its own links recursively.
NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
} else {
// Other link types: print the remote node on the same line, without recursing.
if (link->remNode->type == NET) {
sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
} else {
sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
}
INFO(NCCL_GRAPH, "%s", line);
}
}
}
return ncclSuccess;
}
// Log the whole topology: a header with the system bandwidth figures, one
// tree per CPU node, a footer, and finally the paths.
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
  INFO(NCCL_GRAPH, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw);
  // Scratch buffer shared by the recursive printer.
  char line[1024];
  for (int n=0; n<s->nodes[CPU].count; n++) {
    struct ncclTopoNode* cpu = s->nodes[CPU].nodes+n;
    NCCLCHECK(ncclTopoPrintRec(cpu, NULL, line, 0));
  }
  INFO(NCCL_GRAPH, "==========================================");
  NCCLCHECK(ncclTopoPrintPaths(s));
  return ncclSuccess;
}
// Reorder the links of `node` so that the link to `upNode` (the node we came
// from) becomes the last of its node->nlinks links, then do the same for
// every node reachable through PCI links.
//   upNode : parent in the PCI tree, or NULL for a root (no reordering then).
// Both scans are bounded by node->nlinks rather than by a NULL remNode, so
// they can neither run past a completely full link array nor pick up stale
// entries stored after the last valid link.
// Returns ncclInternalError if `node` has no link to `upNode`.
static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
  // Shift all links to have upLink as last link
  if (upNode) {
    int l=0;
    while (l < node->nlinks && node->links[l].remNode != upNode) l++;
    if (l == node->nlinks) {
      WARN("Error : could not find the link to the parent node while sorting the topology");
      return ncclInternalError;
    }
    struct ncclTopoLink upLink;
    memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
    while (l+1 < node->nlinks) {
      memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
      l++;
    }
    memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
  }
  // Recursively sort the PCI tree
  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
    if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
  }
  return ncclSuccess;
}
// Reorder links across the whole system to ease/accelerate traversal.
// Intended link order on each node:
// 1. NVLinks (already the case)
// 2. PCI down
// 3. PCI up
// 4. SYS (already the case)
// Each CPU node is used as the root of a PCI tree.
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
  for (int n=0; n<system->nodes[CPU].count; n++) {
    struct ncclTopoNode* root = system->nodes[CPU].nodes+n;
    NCCLCHECK(ncclTopoSort(root, NULL));
  }
  return ncclSuccess;
}
// Create a NET node from a <net> XML tag and link it in both directions to
// its NIC node.
//   xmlNet   : XML tag; "dev" is required, all other attributes have fallbacks.
//   nic      : NIC node the new NET node is attached to.
//   systemId : system id combined with "dev" to form the node id.
// NOTE(review): ncclDebugNoWarn stays set to NCCL_GRAPH if one of the checked
// calls between its assignment and its reset returns early.
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) {
  int dev;
  NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));

  struct ncclTopoNode* net;
  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev)));
  net->net.dev = dev;

  // The asic id comes from the "guid" attribute, or defaults to the device index.
  const char* guid;
  NCCLCHECK(xmlGetAttr(xmlNet, "guid", &guid));
  if (guid == NULL) {
    net->net.asic = dev;
  } else {
    sscanf(guid, "0x%lx", &net->net.asic);
  }

  ncclDebugNoWarn = NCCL_GRAPH;
  int speedMbps;
  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &speedMbps, 0));
  if (speedMbps <= 0) speedMbps = 10000; // Some NICs define speed = -1
  net->net.bw = speedMbps / 8000.0;
  if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) {
    net->net.latency = 0;
  }
  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0));
  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0));
  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS));
  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0));
  ncclDebugNoWarn = 0;

  // NIC <-> NET, same bandwidth both ways.
  NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw));
  NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw));
  return ncclSuccess;
}
// Add a NET node for every <net> sub-tag of a <nic> XML tag that carries a
// "dev" attribute. Other sub-tags, and <net> tags without "dev", are ignored.
ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) {
  for (int s=0; s<xmlNic->nSubs; s++) {
    struct ncclXmlNode* child = xmlNic->subs[s];
    if (strcmp(child->name, "net") == 0) {
      int devAttr;
      NCCLCHECK(xmlGetAttrIndex(child, "dev", &devAttr));
      if (devAttr != -1) {
        NCCLCHECK(ncclTopoAddNet(child, system, nic, systemId));
      }
    }
  }
  return ncclSuccess;
}
// Fill the GPU-specific fields of `gpu` from the attributes of a <gpu> XML tag.
// The "sm", "rank", "dev" and "gdr" attributes are read in that order, each
// through a checked call. `system` is not used here.
ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
// "sm" is stored as the CUDA compute capability of the GPU.
NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
// Do not go any further, nvlinks will be added in a second pass
return ncclSuccess;
}
// Maps the "class" attribute of a <pci> tag (and the "tclass" attribute of an
// <nvlink> tag) to a topology node type. The entry with a NULL key ends the
// table and carries the fallback value.
struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
// Maps the "link_speed" attribute of a <pci> tag to a per-lane speed.
// Two spellings of each speed are listed; the NULL-key entry is the fallback.
struct kvDict kvDictPciGen[] = {
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
{ "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
{ NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
// Add the topology node(s) described by a <pci> XML tag and link the created
// node to `parent`.
//   xmlPci   : XML tag; "class" and "busid" are required.
//   parent   : node the new node is connected to (in both directions).
//   systemId : system id combined with the bus id to form node ids.
// Behaviour by content:
//   - <gpu> sub-tag : creates a GPU node, unless the tag has no "rank"
//                     attribute, in which case the whole tag is ignored.
//   - <nic> sub-tag : creates (or reuses) a NIC node shared by all
//                     sub-devices of the same PCI device and adds its nets.
//   - class PCI     : creates a PCI node, builds its 64-bit device id and
//                     recurses into the sub-tags.
// A node that was created here is then linked to `parent` with a bandwidth
// derived from "link_width" and "link_speed".
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId) {
  const char* str;

  int type;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
  NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));

  int64_t busId;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
  NCCLCHECK(busIdToInt64(str, &busId));

  struct ncclTopoNode* node = NULL;
  struct ncclXmlNode* xmlGpu = NULL;
  NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
  if (xmlGpu != NULL) {
    type = GPU;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
    if (index == -1) return ncclSuccess;
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, NCCL_TOPO_ID(systemId, busId)));
    NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
  }

  struct ncclXmlNode* xmlNic = NULL;
  NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
  if (xmlNic != NULL) {
    type = NIC;
    // Ignore sub device ID and merge multi-port NICs into one PCI device.
    busId &= 0xfffffffffffffff0;
    struct ncclTopoNode* nicNode = NULL;
    int64_t id = NCCL_TOPO_ID(systemId, busId);
    NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id));
    if (nicNode == NULL) {
      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id));
      node = nicNode; // Connect it to parent later on
    }
    NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode, systemId));
  } else if (type == PCI) {
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, NCCL_TOPO_ID(systemId, busId)));
    // Device id layout: vendor << 48 | device << 32 | subsystem_vendor << 16 | subsystem_device.
    // Each parsed value is converted to uint64_t before shifting: shifting the
    // signed result of strtol would overflow for ids with bit 15 set (e.g. a
    // vendor id of 0x8086 shifted by 48).
    NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
    if (str) node->pci.device += ((uint64_t)strtol(str, NULL, 0)) << 48;
    NCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
    if (str) node->pci.device += ((uint64_t)strtol(str, NULL, 0)) << 32;
    NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
    if (str) node->pci.device += ((uint64_t)strtol(str, NULL, 0)) << 16;
    NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
    if (str) node->pci.device += (uint64_t)strtol(str, NULL, 0);

    for (int s=0; s<xmlPci->nSubs; s++) {
      struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
      NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
    }
  }

  if (node) {
    int width, speed;
    NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
    NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));

    // Manage cases where speed was not indicated in /sys
    if (width == 0) width = 16;
    NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)

    NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
    NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
  }
  return ncclSuccess;
}
// Maps the "arch" attribute of a <cpu> tag to an architecture constant.
// The NULL-key entry ends the table and gives the value used when nothing matches.
struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
// Maps the "vendor" attribute of a <cpu> tag to a vendor constant (only looked
// up for x86 CPUs). Two different vendor strings map to the ZHAOXIN constant.
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
// Translate the "host_hash" attribute of a <cpu> tag into a system id, i.e.
// the index of that hash in system->hostHashes. A hash seen for the first
// time is appended to the table. A missing attribute counts as hash 0.
//   systemIdPtr : receives the system id.
// NOTE(review): the append does not check the capacity of hostHashes.
ncclResult_t ncclGetSystemId(struct ncclTopoSystem* system, struct ncclXmlNode* xmlCpu, int* systemIdPtr) {
  const char* hostHashStr;
  NCCLCHECK(xmlGetAttr(xmlCpu, "host_hash", &hostHashStr));
  uint64_t hostHash = 0;
  if (hostHashStr) hostHash = strtoull(hostHashStr, NULL, 16);

  // Linear search among the hashes already known.
  int systemId = 0;
  while (systemId < system->nHosts && system->hostHashes[systemId] != hostHash) systemId++;
  if (systemId == system->nHosts) {
    // Not found: register the new host at the end of the table.
    system->hostHashes[systemId] = hostHash;
    system->nHosts++;
  }
  *systemIdPtr = systemId;
  return ncclSuccess;
}
// Create a CPU node from a <cpu> XML tag and add everything below it.
//   - "numaid" and the system id derived from "host_hash" form the node id.
//   - "affinity" (optional) is parsed into the CPU set of the node.
//   - "arch" is required; for x86, "vendor" is required too, and Intel and
//     Zhaoxin CPUs additionally need "familyid" and "modelid" to pick a model.
//   - <pci> sub-tags are added below the CPU node.
//   - <nic> sub-tags are attached to one NIC node per system, linked directly
//     to the CPU. That node is looked up with the same id it is created with,
//     so it is found again for any system id and not only for system 0.
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
  int numaId;
  NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
  int systemId;
  NCCLCHECK(ncclGetSystemId(system, xmlCpu, &systemId));
  struct ncclTopoNode* cpu;
  NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, NCCL_TOPO_ID(systemId, numaId)));
  const char* str;
  NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
  if (str != NULL) {
    NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
  }

  NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
  NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
    NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
    NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
    if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
      cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
    } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
      // The model is left as set at node creation for any other family/model.
      if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
    }
  }
  for (int s=0; s<xmlCpu->nSubs; s++) {
    struct ncclXmlNode* node = xmlCpu->subs[s];
    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId));
    if (strcmp(node->name, "nic") == 0) {
      struct ncclTopoNode* nic = NULL;
      // Same id for lookup and creation; a bare 0 would only match system 0.
      int64_t nicId = NCCL_TOPO_ID(systemId, 0);
      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, nicId));
      if (nic == NULL) {
        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, nicId));
        NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW));
        NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW));
      }
      NCCLCHECK(ncclTopoAddNic(node, system, nic, systemId));
    }
  }
  return ncclSuccess;
}
// Walk the XML tree and add a LINK_NVL link for every <nvlink> tag.
//   node        : current XML tag.
//   parentBusId : bus id of the closest enclosing tag that has a "busid"
//                 attribute; for an <nvlink> tag this identifies the GPU.
//   systemId    : system id in effect; replaced when a <cpu> tag is entered.
// The remote end depends on the "tclass" attribute: another GPU (looked up by
// "target"), the local CPU, or otherwise the single NVS node (created on
// first use). Links towards a non-GPU remote are added in both directions.
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
  if (strcmp(node->name, "nvlink") != 0) {
    // Not an nvlink: keep descending, tracking the system id and bus id.
    if (strcmp(node->name, "cpu") == 0) {
      NCCLCHECK(ncclGetSystemId(system, node, &systemId));
    }
    const char* busId;
    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
    const char* childBusId = busId ? busId : parentBusId;
    for (int s=0; s<node->nSubs; s++) {
      NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, childBusId, systemId));
    }
    return ncclSuccess;
  }

  // Local end: the GPU owning this <nvlink> tag.
  struct ncclTopoNode* gpu = NULL;
  int64_t pBusId;
  NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
  pBusId = NCCL_TOPO_ID(systemId, pBusId);
  NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
  if (gpu == NULL) {
    WARN("Add NVLink error : could not find GPU %lx", pBusId);
    return ncclInternalError;
  }

  int count;
  NCCLCHECK(xmlGetAttrInt(node, "count", &count));
  const char* targetClass;
  NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
  int targetType;
  NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));

  // Remote end.
  struct ncclTopoNode* remote = NULL;
  if (targetType == GPU) {
    // NVL P2P connection to another GPU
    const char* target;
    NCCLCHECK(xmlGetAttrStr(node, "target", &target));
    int64_t busId;
    NCCLCHECK(busIdToInt64(target, &busId));
    NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId)));
  } else if (targetType == CPU) {
    // NVL connection to the local CPU
    NCCLCHECK(findLocalCpu(gpu, &remote));
  } else if (system->nodes[NVS].count == 0) {
    NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
  } else {
    remote = system->nodes[NVS].nodes;
  }
  if (remote == NULL) return ncclSuccess;

  float nvlBw = ncclTopoNVLinkBw(gpu->gpu.cudaCompCap);
  NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlBw));
  // The reverse link is only added here when the remote is not a GPU.
  if (remote->type != GPU) {
    NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlBw));
  }
  return ncclSuccess;
}
// Walk the XML tree and, for every <c2c> tag, link the owning GPU and its
// local CPU in both directions with a LINK_NVL link.
//   node        : current XML tag.
//   parentBusId : bus id of the closest enclosing tag that has a "busid"
//                 attribute; for a <c2c> tag this identifies the GPU.
//   systemId    : system id in effect; replaced when a <cpu> tag is entered.
// The link bandwidth is bw*count/1000 from the "bw" and "count" attributes.
// Nothing is added when no local CPU is found for the GPU.
// (The warning text below mentions NVLink; it is reproduced as is.)
ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
  if (strcmp(node->name, "c2c") != 0) {
    // Not a c2c tag: keep descending, tracking the system id and bus id.
    if (strcmp(node->name, "cpu") == 0) {
      NCCLCHECK(ncclGetSystemId(system, node, &systemId));
    }
    const char* busId;
    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
    const char* childBusId = busId ? busId : parentBusId;
    for (int s=0; s<node->nSubs; s++) {
      NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, childBusId, systemId));
    }
    return ncclSuccess;
  }

  struct ncclTopoNode* gpu = NULL;
  int64_t pBusId;
  NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
  pBusId = NCCL_TOPO_ID(systemId, pBusId);
  NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
  if (gpu == NULL) {
    WARN("Add NVLink error : could not find GPU %lx", pBusId);
    return ncclInternalError;
  }

  int count = 0;
  NCCLCHECK(xmlGetAttrInt(node, "count", &count));
  int bw = 0;
  NCCLCHECK(xmlGetAttrInt(node, "bw", &bw));
  double c2cBw = (bw*count)/1000.0;

  struct ncclTopoNode* cpu = NULL;
  NCCLCHECK(findLocalCpu(gpu, &cpu));
  if (cpu != NULL) {
    NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw));
    NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
  }
  return ncclSuccess;
}
// Build a topology system from an XML description.
//   xml           : parsed XML; must contain a "system" tag.
//   topoSystem    : receives the newly allocated system.
//   localHostHash : host hash of the calling process; the index of the
//                   matching host becomes system->systemId.
// Steps, in order: CPU sub-trees, NVLinks, C2C links, BCM switch flattening,
// CPU interconnect, link sorting.
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash) {
  NCCLCHECK(ncclCalloc(topoSystem, 1));
  struct ncclTopoSystem* system = *topoSystem;

  struct ncclXmlNode* topNode;
  NCCLCHECK(xmlFindTag(xml, "system", &topNode));

  // First pass: only the <cpu> tags directly below the top tag.
  for (int s=0; s<topNode->nSubs; s++) {
    struct ncclXmlNode* child = topNode->subs[s];
    if (strcmp(child->name, "cpu") == 0) {
      NCCLCHECK(ncclTopoAddCpu(child, system));
    }
  }

  // Record which of the known hosts is the local one.
  for (int host=0; host<system->nHosts; host++) {
    if (system->hostHashes[host] == localHostHash) system->systemId = host;
  }

  NCCLCHECK(ncclTopoAddNvLinks(topNode, system, NULL, 0));
  NCCLCHECK(ncclTopoAddC2c(topNode, system, NULL, 0));
  NCCLCHECK(ncclTopoFlattenBcmSwitches(system));
  NCCLCHECK(ncclTopoConnectCpus(system));
  NCCLCHECK(ncclTopoSortSystem(system));
  return ncclSuccess;
}
// Declares the TopoDumpFileRank parameter ("TOPO_DUMP_FILE_RANK", default 0).
// ncclTopoGetSystem compares it with comm->rank, through ncclParamTopoDumpFileRank(),
// to decide whether this rank writes the topology dump file.
NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);
// Give an XML node an integer attribute, but only if the node does not
// already have an attribute of that name; an existing value is kept.
// NOTE(review): the new entry is appended without checking the capacity of
// node->attrs.
static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
  int existing;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &existing));
  if (existing != -1) return ncclSuccess;

  const int slot = node->nAttrs++;
  strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
  snprintf(node->attrs[slot].value, MAX_STR_LEN, "%d", value);
  return ncclSuccess;
}
// Give an XML node a 64-bit attribute, written in hexadecimal with a "0x"
// prefix, but only if the node does not already have an attribute of that
// name; an existing value is kept.
// NOTE(review): the new entry is appended without checking the capacity of
// node->attrs.
static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) {
  int existing;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &existing));
  if (existing != -1) return ncclSuccess;

  const int slot = node->nAttrs++;
  strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
  snprintf(node->attrs[slot].value, MAX_STR_LEN, "0x%lx", value);
  return ncclSuccess;
}
// Give an XML node a floating-point attribute, but only if the node does not
// already have an attribute of that name; an existing value is kept.
// NOTE(review): the new entry is appended without checking the capacity of
// node->attrs.
static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) {
  int existing;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &existing));
  if (existing != -1) return ncclSuccess;

  const int slot = node->nAttrs++;
  strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
  snprintf(node->attrs[slot].value, MAX_STR_LEN, "%f", value);
  return ncclSuccess;
}
// Build the topology system of a communicator.
// Steps:
//   1. Load an XML topology from NCCL_TOPO_FILE, or else from the default
//      location, or else start from an empty "system" tag.
//   2. Add the GPUs of all ranks that share the host hash of this rank.
//   3. Add the network devices (collnet first so that it takes precedence,
//      then the regular network plugin).
//   4. Trim the XML branches that were not marked with keep="1".
//   5. With MNNVL, exchange the XML with the other ranks of the clique and
//      fuse all of them into one XML.
//   6. Optionally dump the XML (NCCL_TOPO_DUMP_FILE, on the selected rank).
//   7. Convert the XML into *system.
// The buffer used for the MNNVL exchange is released before returning.
// NOTE(review): as before, an early return from a failed checked call does
// not release `xml` (nor the exchange buffer).
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
  // All-gather buffer for the MNNVL exchange; stays NULL when MNNVL is off.
  char* mem = NULL;
  struct ncclXml* xml;
  NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
  const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
  if (xmlTopoFile) {
    INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
    NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
  } else {
    // Try default XML topology location
    NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
  }
  if (xml->maxIndex == 0) {
    // Create top tag
    struct ncclXmlNode* top;
    NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
    NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
  }

  // Auto-detect GPUs if needed
  for (int r=0; r<comm->nRanks; r++) {
    if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
      NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
      struct ncclXmlNode* node;
      NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
      if (node == NULL) continue;
      NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
      NCCLCHECK(xmlSetAttrInt(node, "rank", r));
      // Only set when the XML does not already provide a value.
      NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
    }
  }
  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
  // so we start with collnet so that it has precedence.
  int netDevCount = 0;
  if (collNetSupport(comm)) {
    NCCLCHECK(collNetDevices(comm, &netDevCount));
    for (int n=0; n<netDevCount; n++) {
      ncclNetProperties_t props;
      NCCLCHECK(collNetGetProperties(comm, n, &props));
      struct ncclXmlNode* netNode;
      NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
      NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
      NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
      NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
      NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
      NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
      NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
      bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
      INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
      NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
      NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
    }
  }
  // Without collnet devices, ask the regular network plugin how many it has.
  if (netDevCount == 0) {
    NCCLCHECK(comm->ncclNet->devices(&netDevCount));
  }
  for (int n=0; n<netDevCount; n++) {
    ncclNetProperties_t props;
    NCCLCHECK(comm->ncclNet->getProperties(n, &props));
    struct ncclXmlNode* netNode;
    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
    NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
    NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
  }

  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
  NCCLCHECK(ncclTopoTrimXml(xml));

  if (comm->MNNVL) {
    // MNNVL clique support
    // One fixed-size XML slot per clique rank; ours is filled, then all are gathered.
    NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
    struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank);
    memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
    NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
    NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
    struct ncclXml* cliqueXml;
    NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES));
    for (int i = 0; i < comm->clique.size; i++) {
      struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
      NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
      NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml));
    }
    free(xml);
    xml = cliqueXml;
  }

  xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
    INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
    NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
  }

  NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash));
  // The exchange buffer is only released once the XML has been fully consumed.
  free(mem);
  free(xml);
  return ncclSuccess;
}
// List the nodes of type `resultType` that are "closest" to node `index` of
// type `type`: those whose path has the highest bandwidth and, among equal
// bandwidths, the lowest path type.
//   locals     : receives a newly allocated array of node indexes (sized for
//                all nodes of resultType); the caller frees it.
//   localCount : receives the number of entries stored in *locals.
//   pathType   : optional; updated with the best path type each time a better
//                path is found (left untouched if none ever is).
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
  int bestType = PATH_DIS;
  float bestBw = 0;
  int found = 0;
  NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
  struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
  for (int i=0; i<system->nodes[resultType].count; i++) {
    struct ncclTopoLinkList* candidate = paths+i;
    int improves = candidate->bw > bestBw || (candidate->bw == bestBw && candidate->type < bestType);
    if (improves) {
      // New best: forget everything collected so far.
      bestBw = candidate->bw;
      bestType = candidate->type;
      if (pathType) *pathType = bestType;
      found = 0;
    }
    if (candidate->bw == bestBw && candidate->type == bestType) {
      (*locals)[found] = i;
      found++;
    }
  }
  *localCount = found;
  return ncclSuccess;
}
// Count how many of the local NET nodes of GPU index `gpu` have to be summed up,
// in the order returned by ncclTopoGetLocal, before their accumulated path
// bandwidth reaches the bandwidth of the GPU's link to the CPU.
// The result is stored in *count.
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) {
  struct ncclTopoNode* gpuNode = system->nodes[GPU].nodes+gpu;

  // Assume the BW to the CPU reflects the GPU bandwidth via P2P or C2C.
  // Caveat: this could be wrong if there is a PCIe switch and a narrower
  // link to the CPU. When several links lead to a CPU, the last one wins.
  float gpuBw = 0;
  for (int l=0; l<gpuNode->nlinks; l++) {
    if (gpuNode->links[l].remNode->type != CPU) continue;
    gpuBw = gpuNode->links[l].bw;
  }

  int* localNets;
  int localNetCount = 0;
  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));

  // Add NETs one by one until the GPU bandwidth is covered or none is left.
  float totalNetBw = 0;
  int used = 0;
  while (used < localNetCount && totalNetBw < gpuBw) {
    totalNetBw += gpuNode->paths[NET][localNets[used]].bw;
    used++;
  }

  *count = used;
  free(localNets);
  return ncclSuccess;
}
// Helper for ncclTopoGetLocalNet: among the `localNetCount` NET indices listed in
// `localNets`, choose the one GPU index `gpu` uses for `channelId` and report its
// id and/or device number. The temporary GPU list is allocated and released here,
// so the caller only has to release `localNets`, whatever the outcome.
static ncclResult_t pickLocalNetForChannel(struct ncclTopoSystem* system, int gpu, int channelId, int* localNets, int localNetCount, int64_t* id, int* dev) {
  if (localNetCount == 0) {
    // Nothing to choose from: going on would read localNets[0] and compute net%0.
    WARN("Could not find any local path from gpu %d to net.", gpu);
    return ncclInternalError;
  }
  int* localGpus = NULL;
  int localGpuCount;
  NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
  if (localGpuCount == 0) {
    // DIVUP(localNetCount, 0) below would presumably divide by zero.
    free(localGpus);
    WARN("Could not find any local path from net %d to gpu.", localNets[0]);
    return ncclInternalError;
  }
  // Start from the GPU device number, bit-mirrored when the NET count is a power
  // of two, then offset by channel.
  int net = system->nodes[GPU].nodes[gpu].gpu.dev;
  if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
  net += channelId%(DIVUP(localNetCount,localGpuCount));
  int netIndex = localNets[net%localNetCount];
  if (id) *id = system->nodes[NET].nodes[netIndex].id;
  if (dev) *dev = system->nodes[NET].nodes[netIndex].net.dev;
  free(localGpus);
  return ncclSuccess;
}

// Select the NET node that `rank` should use for channel `channelId`.
//   id  : if non-NULL, receives the id of the selected NET node.
//   dev : if non-NULL, receives its net.dev value.
// Returns ncclInternalError when the GPU has no local NET (or the NET no local
// GPU) instead of indexing an empty list / dividing by zero, and releases the
// local NET list on every path, including failures.
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
  int gpu;
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
  int* localNets;
  int localNetCount;
  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
  // No NCCLCHECK here: localNets must be released before the result is returned.
  ncclResult_t res = pickLocalNetForChannel(system, gpu, channelId, localNets, localNetCount, id, dev);
  free(localNets);
  return res;
}
// Helper for ncclTopoGetLocalGpu: scan channels 0..MAXCHANNELS-1 and, for each,
// the GPUs listed in `localGpus`, looking for the first GPU whose local NET for
// that channel is `netId`. Stores its index in *gpuIndex, or -1 if none matches.
// Kept separate so that an early NCCLCHECK return cannot skip the caller's free().
static ncclResult_t findLocalGpuUsingNet(struct ncclTopoSystem* system, int64_t netId, int* localGpus, int localGpuCount, int* gpuIndex) {
  for (int c=0; c<MAXCHANNELS; c++) {
    for (int lg=0; lg<localGpuCount; lg++) {
      int g = localGpus[lg];
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
      int64_t id;
      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL));
      if (netId == id) {
        *gpuIndex = g;
        return ncclSuccess;
      }
    }
  }
  *gpuIndex = -1;
  return ncclSuccess;
}

// Find a GPU, among those local to the NET node identified by `netId`, that
// selects this NET for one of its channels.
//   gpuIndex : receives the index of that GPU, or -1 when no local GPU uses it.
// The list of local GPUs is released on every path, including when the lookup
// of a GPU's local NET fails.
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
  int netIndex;
  NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex));
  int* localGpus = NULL;
  int localGpuCount;
  NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
  // No NCCLCHECK here: localGpus must be released before the result is returned.
  ncclResult_t res = findLocalGpuUsingNet(system, netId, localGpus, localGpuCount, gpuIndex);
  free(localGpus);
  return res;
}
/****************************/
/* External query functions */
/****************************/
// Report the architecture, vendor and model recorded for the first CPU node
// (index 0) of the topology. The three output pointers are written
// unconditionally, so none of them may be NULL.
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model) {
  struct ncclTopoNode* firstCpu = system->nodes[CPU].nodes;
  *arch = firstCpu->cpu.arch;
  *vendor = firstCpu->cpu.vendor;
  *model = firstCpu->cpu.model;
  return ncclSuccess;
}
// Tunable read by ncclTopoGetCpuAffinity() through ncclParamIgnoreCpuAffinity():
// a non-zero value makes it return the CPU set of the GPU's closest CPU as is,
// instead of intersecting that set with the calling process' affinity mask.
// NOTE(review): presumably exposed as the NCCL_IGNORE_CPU_AFFINITY environment
// variable with 0 as its default -- confirm against the NCCL_PARAM definition.
NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
// Compute the CPU set to use for the GPU owned by `rank`.
//
// The GPU is looked up by rank and the CPU node with the fewest hops on the
// GPU->CPU path is taken as its closest CPU. The result written to *affinity is
// either that CPU's affinity set as is (when ncclParamIgnoreCpuAffinity() is
// non-zero) or its intersection with the affinity mask of the calling process.
// The result may be an empty set; this function only computes and logs it.
//
// Returns ncclInternalError when no GPU matches `rank` or when the topology has
// no CPU node to choose from.
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
  struct ncclTopoNode* cpu = NULL, *gpu = NULL;
  for (int g=0; g<system->nodes[GPU].count; g++) {
    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
      gpu = system->nodes[GPU].nodes+g;
      // Find the closest CPU (fewest hops; the first one wins on ties)
      int cpuIndex = -1, minHops = 0;
      for (int c=0; c<system->nodes[CPU].count; c++) {
        int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
        if (cpuIndex == -1 || nHops < minHops) {
          cpuIndex = c;
          minHops = nHops;
        }
      }
      // Without any CPU node cpuIndex is still -1: keep cpu NULL so the check
      // below reports the failure, rather than pointing before the node array.
      if (cpuIndex != -1) cpu = system->nodes[CPU].nodes+cpuIndex;
    }
  }
  if (cpu == NULL) {
    WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
    return ncclInternalError;
  }

  // Query the CPU affinity set we were provided
  cpu_set_t mask;
  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");

#ifdef ENABLE_TRACE
  {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
  }
#endif

  // Get the affinity of the CPU close to our GPU.
  cpu_set_t cpuMask = cpu->cpu.affinity;

#ifdef ENABLE_TRACE
  {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr));
    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
  }
#endif

  cpu_set_t finalMask;
  if (ncclParamIgnoreCpuAffinity()) {
    // Ignore the CPU affinity set and use the GPU one instead
    finalMask = cpuMask;
  } else {
    // Use a subset of the GPU affinity set
    CPU_AND(&finalMask, &mask, &cpuMask);
  }

  memcpy(affinity, &finalMask, sizeof(cpu_set_t));

  // Only log the set when it is not empty
  if (CPU_COUNT(&finalMask)) {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
  }
  return ncclSuccess;
}
// Store in *count the number of GPU nodes held by the topology.
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count) {
  const int nGpus = system->nodes[GPU].count;
  *count = nGpus;
  return ncclSuccess;
}
// Store in *count the number of NET nodes held by the topology.
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
  const int nNets = system->nodes[NET].count;
  *count = nNets;
  return ncclSuccess;
}
// Store in *count the number of NVS nodes held by the topology.
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count) {
  const int nNvs = system->nodes[NVS].count;
  *count = nNvs;
  return ncclSuccess;
}
// Compute the lowest and highest cudaCompCap value over all GPU nodes of the
// topology.
//   ccMin : if non-NULL, receives the lowest value.
//   ccMax : if non-NULL, receives the highest value.
// Returns ncclInternalError when the topology holds no GPU node.
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
  const int nGpus = system->nodes[GPU].count;
  if (nGpus == 0) return ncclInternalError;

  // Seed both bounds with the first GPU, then widen them with the others.
  int lowest = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
  int highest = lowest;
  for (int g=1; g<nGpus; g++) {
    const int cc = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
    if (cc < lowest) lowest = cc;
    if (cc > highest) highest = cc;
  }

  if (ccMin) *ccMin = lowest;
  if (ccMax) *ccMax = highest;
  return ncclSuccess;
}