/*************************************************************************
 * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#include "core.h"
|
|
#include "graph.h"
|
|
#include "topo.h"
|
|
#include "comm.h"
|
|
#include "net.h"
|
|
#include "channel.h"
|
|
#include "transport.h"
|
|
#include "device.h"
|
|
#include "xml.h"
|
|
|
|
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
|
|
|
|
struct ncclTopoNodeList {
|
|
struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES];
|
|
int count;
|
|
};
|
|
|
|
// Find, among the paths stored on `node`, the one leading to the node of
// type `t` whose id is `id`.
//   system : topology holding the per-type node arrays that are searched
//   node   : node whose paths[t] array is indexed
//   path   : out, set to node->paths[t] + (index of the matching node)
// Returns ncclSuccess on a match, ncclInternalError (after a WARN) otherwise.
static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) {
  const int count = system->nodes[t].count;
  int idx = 0;
  // Linear scan for the first node of type t carrying the requested id.
  while (idx < count && system->nodes[t].nodes[idx].id != id) idx++;
  if (idx == count) {
    WARN("Could not find node of type %d id %lx", t, id);
    return ncclInternalError;
  }
  *path = node->paths[t]+idx;
  return ncclSuccess;
}
|
|
|
|
NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);
|
|
|
|
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
|
|
if (baseNode->paths[baseNode->type] == NULL) {
|
|
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
|
for (int i=0; i<system->nodes[baseNode->type].count; i++) baseNode->paths[baseNode->type][i].type = PATH_DIS;
|
|
}
|
|
|
|
// breadth-first search to set all paths to that node in the system
|
|
struct ncclTopoNodeList nodeList;
|
|
struct ncclTopoNodeList nextNodeList = { { 0 }, 0 };
|
|
nodeList.count = 1; nodeList.list[0] = baseNode;
|
|
struct ncclTopoLinkList* basePath;
|
|
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
|
|
basePath->count = 0;
|
|
basePath->bw = LOC_BW;
|
|
basePath->type = PATH_LOC;
|
|
|
|
while (nodeList.count) {
|
|
nextNodeList.count = 0;
|
|
for (int n=0; n<nodeList.count; n++) {
|
|
struct ncclTopoNode* node = nodeList.list[n];
|
|
struct ncclTopoLinkList* path;
|
|
NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
|
|
for (int l=0; l<node->nlinks; l++) {
|
|
struct ncclTopoLink* link = node->links+l;
|
|
struct ncclTopoNode* remNode = link->remNode;
|
|
if (remNode->paths[baseNode->type] == NULL) {
|
|
NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
|
for (int i=0; i<system->nodes[baseNode->type].count; i++) remNode->paths[baseNode->type][i].type = PATH_DIS;
|
|
}
|
|
struct ncclTopoLinkList* remPath;
|
|
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
|
|
float bw = std::min(path->bw, link->bw);
|
|
|
|
// allow routing through a GPU only as 1 hop
|
|
if (node != baseNode && node->type == GPU &&
|
|
(ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
|
|
|
|
if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
|
|
// Find reverse link
|
|
for (int l=0; l<remNode->nlinks; l++) {
|
|
if (remNode->links[l].remNode == node && remNode->links[l].type == link->type) {
|
|
remPath->list[0] = remNode->links+l;
|
|
break;
|
|
}
|
|
}
|
|
if (remPath->list[0] == NULL) {
|
|
WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
|
|
remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
|
|
return ncclInternalError;
|
|
}
|
|
// Copy the rest of the path
|
|
for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
|
|
remPath->count = path->count + 1;
|
|
remPath->bw = bw;
|
|
|
|
// Start with path type = link type. PATH and LINK types are supposed to match.
|
|
// Don't consider LINK_NET as we only care about the NIC->GPU path.
|
|
int type = link->type == LINK_NET ? LINK_LOC : link->type;
|
|
// Differentiate between one and multiple PCI switches
|
|
if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
|
|
// Consider a path going through the CPU as PATH_PHB
|
|
if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
|
|
// Set 1 hop NVLink as NVB
|
|
//if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
|
|
|
|
remPath->type = std::max(path->type, type);
|
|
|
|
// Add to the list for the next iteration if not already in the list
|
|
// Disallow GPUs as intermediate steps for now
|
|
if (remNode->type != GPU) {
|
|
int i;
|
|
for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
|
|
if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
|
|
static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
|
|
const int linesize = 2048;
|
|
char line[linesize];
|
|
#ifdef ENABLE_TRACE
|
|
INFO(NCCL_GRAPH, "Paths from %s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
|
|
#else
|
|
snprintf(line, linesize, "%s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
|
|
int offset = strlen(line);
|
|
#endif
|
|
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
|
|
if (node->paths[t] == NULL) continue;
|
|
for (int n = 0; n<system->nodes[t].count; n++) {
|
|
#ifdef ENABLE_TRACE
|
|
line[0] = 0;
|
|
int offset = 0;
|
|
for (int i=0; i<node->paths[t][n].count; i++) {
|
|
struct ncclTopoLink* link = node->paths[t][n].list[i];
|
|
struct ncclTopoNode* remNode = link->remNode;
|
|
snprintf(line+offset, linesize-offset, "--%s(%g)->%s/%lx-%lx", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], NCCL_TOPO_ID_SYSTEM_ID(remNode->id), NCCL_TOPO_ID_LOCAL_ID(remNode->id));
|
|
offset = strlen(line);
|
|
}
|
|
INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw);
|
|
#else
|
|
snprintf(line+offset, linesize-offset, "%s/%lx-%lx (%d/%.1f/%s) ", topoNodeTypeStr[t], NCCL_TOPO_ID_SYSTEM_ID(system->nodes[t].nodes[n].id), NCCL_TOPO_ID_LOCAL_ID(system->nodes[t].nodes[n].id), node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]);
|
|
offset = strlen(line);
|
|
#endif
|
|
}
|
|
}
|
|
#ifndef ENABLE_TRACE
|
|
INFO(NCCL_GRAPH, "%s", line);
|
|
#endif
|
|
}
|
|
|
|
// Log the paths of every GPU node, then of every NET node, of `system`.
// Always returns ncclSuccess.
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
  const int printedTypes[] = { GPU, NET };
  const int nTypes = sizeof(printedTypes)/sizeof(printedTypes[0]);
  for (int k=0; k<nTypes; k++) {
    const int t = printedTypes[k];
    for (int i=0; i<system->nodes[t].count; i++) {
      printNodePaths(system, system->nodes[t].nodes+i);
    }
  }
  return ncclSuccess;
}
|
|
|
|
// Pick the CPU with the fewest hops from GPU index `gpu`.
//   retCpu : out, index of the selected CPU within system->nodes[CPU]
// CPUs whose path has a hop count of 0 or less are ignored; on equal hop
// counts the lowest CPU index wins. Returns ncclInternalError (after a WARN)
// when no CPU qualifies.
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
  struct ncclTopoLinkList* cpuPaths = system->nodes[GPU].nodes[gpu].paths[CPU];
  int bestCpu = -1;
  int bestHops = 0;  // 0 means "no candidate yet"
  for (int c=0; c<system->nodes[CPU].count; c++) {
    const int hops = cpuPaths[c].count;
    if (hops <= 0) continue;
    if (bestHops != 0 && hops >= bestHops) continue;
    bestCpu = c;
    bestHops = hops;
  }
  if (bestCpu == -1) {
    WARN("Error : could not find CPU close to GPU %d", gpu);
    return ncclInternalError;
  }
  *retCpu = bestCpu;
  return ncclSuccess;
}
|
|
|
|
// Rewrite the path from node (t1,i1) to node (t2,i2) so that it goes through
// the intermediate node (tx,ix): the new path is the concatenation of
// (t1,i1)->(tx,ix) and (tx,ix)->(t2,i2).
// The resulting type is the worse of the two legs, or PATH_PXN when the
// intermediate node is a GPU; the bandwidth is the lower of the two legs.
// Always returns ncclSuccess.
static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
  struct ncclTopoNode* viaNode = system->nodes[tx].nodes+ix;
  struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;

  struct ncclTopoLinkList* firstLeg  = srcNode->paths[tx]+ix;  // source -> intermediate
  struct ncclTopoLinkList* secondLeg = viaNode->paths[t2]+i2;  // intermediate -> destination
  struct ncclTopoLinkList* combined  = srcNode->paths[t2]+i2;  // source -> destination, overwritten

  int nlinks = 0;
  // Source -> intermediate node
  for (int i=0; i<firstLeg->count; i++) combined->list[nlinks++] = firstLeg->list[i];
  // Intermediate node -> destination
  for (int i=0; i<secondLeg->count; i++) combined->list[nlinks++] = secondLeg->list[i];

  // Update path characteristics
  combined->count = nlinks;
  combined->type = std::max(firstLeg->type, secondLeg->type);
  if (tx == GPU) combined->type = PATH_PXN;
  combined->bw = std::min(firstLeg->bw, secondLeg->bw);
  return ncclSuccess;
}
|
|
|
|
// Remove/free all paths
|
|
static void ncclTopoRemovePaths(struct ncclTopoSystem* system) {
|
|
for (int t1=0; t1<NCCL_TOPO_NODE_TYPES; t1++) {
|
|
for (int n=0; n<system->nodes[t1].count; n++) {
|
|
struct ncclTopoNode* node = system->nodes[t1].nodes+n;
|
|
for (int t2=0; t2<NCCL_TOPO_NODE_TYPES; t2++) {
|
|
if (node->paths[t2]) free(node->paths[t2]);
|
|
node->paths[t2] = NULL;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS };
|
|
ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
|
|
if (*level == -1) {
|
|
int l = -1;
|
|
if (disableEnv) {
|
|
const char* str = ncclGetEnv(disableEnv);
|
|
if (str) {
|
|
int disable = strtol(str, NULL, 0);
|
|
if (disable == 1) l = 0;
|
|
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %d", disableEnv, disable);
|
|
}
|
|
}
|
|
if (l == -1) {
|
|
const char* str = ncclGetEnv(levelEnv);
|
|
if (str) {
|
|
for (int i=0; i<=PATH_SYS; i++) {
|
|
if (strcmp(str, topoPathTypeStr[i]) == 0) {
|
|
l = i;
|
|
break;
|
|
}
|
|
}
|
|
// Old style numbering
|
|
// levelsOldToNew to is an array with each index corresponding to the
|
|
// "old level" int, and each value mapping to the correct value defined in topo.h
|
|
// maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
|
|
if (l == -1 && str[0] >= '0' && str[0] <= '9') {
|
|
int oldLevel = strtol(str, NULL, 0);
|
|
const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
|
|
if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
|
|
l = levelsOldToNew[oldLevel];
|
|
}
|
|
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
|
|
}
|
|
}
|
|
*level = l >= 0 ? l : -2;
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
|
|
NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
|
|
|
|
int ncclTopoUserP2pLevel = -1;
|
|
ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2,
|
|
int* p2p, int *read, int* intermediateRank) {
|
|
int mnnvl = 0;
|
|
struct ncclPeerInfo* info1 = NULL;
|
|
struct ncclPeerInfo* info2 = NULL;
|
|
*p2p = 0;
|
|
if (read) *read = 0;
|
|
if (intermediateRank) *intermediateRank = -1;
|
|
|
|
// Rule out different nodes / isolated containers
|
|
if (comm) {
|
|
info1 = comm->peerInfo+rank1;
|
|
info2 = comm->peerInfo+rank2;
|
|
if (info1->hostHash != info2->hostHash) {
|
|
if (comm->MNNVL) {
|
|
NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, &mnnvl));
|
|
if (!mnnvl) return ncclSuccess;
|
|
} else {
|
|
return ncclSuccess;
|
|
}
|
|
} else if (info1->shmDev != info2->shmDev) {
|
|
return ncclSuccess;
|
|
}
|
|
}
|
|
|
|
// Get GPUs from topology
|
|
int g1, g2;
|
|
NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1));
|
|
struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
|
|
if (ncclTopoRankToIndex(system, rank2, &g2) == ncclInternalError) {
|
|
// GPU not found, we can't use p2p.
|
|
return ncclSuccess;
|
|
}
|
|
|
|
int intermediateIndex = -1;
|
|
// Set intermediate GPU rank, if routing through an intermediate GPU.
|
|
struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
|
|
if (path->count == 2) {
|
|
struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
|
|
if (intermediateNode->type == GPU) {
|
|
intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
|
|
if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
|
|
}
|
|
}
|
|
|
|
// By default don't use P2P across CPU Host Bridges and further apart
|
|
int p2pLevel = PATH_PXB;
|
|
|
|
int arch, vendor, model;
|
|
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
|
// Allow P2P between pairs of GPUs on AMD systems
|
|
if ((arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD) && system->nodes[GPU].count <= 2) p2pLevel = PATH_SYS;
|
|
|
|
// User override
|
|
if (ncclTopoUserP2pLevel == -1)
|
|
NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
|
|
if (ncclTopoUserP2pLevel != -2) {
|
|
p2pLevel = ncclTopoUserP2pLevel;
|
|
goto compare;
|
|
}
|
|
|
|
|
|
compare:
|
|
// Compute the PCI distance and compare with the p2pLevel.
|
|
if (path->type <= p2pLevel) *p2p = 1;
|
|
|
|
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
|
#else
|
|
if (*p2p == 1) {
|
|
// NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
|
|
// validate against NVML at all since they are pretending to be on other hw.
|
|
if (g1 != g2 && (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash &&
|
|
info1->hostHash == info2->hostHash)) && ncclParamIgnoreDisabledP2p() != 2) {
|
|
int indexes[3] = {-1,-1,-1};
|
|
int verticeN = 0;
|
|
NCCLCHECK(ncclNvmlEnsureInitialized());
|
|
|
|
indexes[verticeN++] = system->nodes[GPU].nodes[g1].gpu.dev;
|
|
if (intermediateIndex != -1) indexes[verticeN++] = system->nodes[GPU].nodes[intermediateIndex].gpu.dev;
|
|
indexes[verticeN++] = system->nodes[GPU].nodes[g2].gpu.dev;
|
|
|
|
for (int i=1; i < verticeN; i++) {
|
|
nvmlGpuP2PStatus_t status;
|
|
status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusRead;
|
|
bool good = status == NVML_P2P_STATUS_OK;
|
|
status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite;
|
|
good &= status == NVML_P2P_STATUS_OK;
|
|
if (!good) {
|
|
if (!ncclParamIgnoreDisabledP2p()) {
|
|
if (path->type <= PATH_NVB) {
|
|
WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
|
|
return ncclUnhandledCudaError;
|
|
} else if (path->type < PATH_SYS) {
|
|
INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
|
|
}
|
|
}
|
|
*p2p = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (path->type == PATH_NVL) {
|
|
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
|
|
// Enable P2P Read for Ampere/NVLink only
|
|
if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
|
|
}
|
|
|
|
return ncclSuccess;
|
|
}
|
|
|
|
// MNNVL: Check whether peers are in the same fabric cluster and clique
|
|
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret) {
|
|
*ret = 0;
|
|
|
|
nvmlGpuFabricInfoV_t *fabricInfo1 = &info1->fabricInfo;
|
|
nvmlGpuFabricInfoV_t *fabricInfo2 = &info2->fabricInfo;
|
|
// A zero UUID means we don't have MNNVL fabric info
|
|
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
|
|
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
|
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
|
INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
|
|
info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
|
|
*ret = 1;
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
|
|
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
|
|
int ncclTopoUserGdrLevel = -1;
|
|
|
|
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) {
|
|
*useGdr = 0;
|
|
|
|
// Get GPU and NET
|
|
int n, g;
|
|
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
|
|
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
|
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
|
|
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
|
|
|
// Check that both the NIC and GPUs support it
|
|
if (net->net.gdrSupport == 0) return ncclSuccess;
|
|
if (gpu->gpu.gdrSupport == 0) return ncclSuccess;
|
|
|
|
if (read) { // For reads (sends) only enable under certain conditions
|
|
int gdrReadParam = ncclParamNetGdrRead();
|
|
if (gdrReadParam == 0) return ncclSuccess;
|
|
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
|
#else
|
|
// Disable GDR Reads pre-Ampere when we have other PCI flows
|
|
if (gdrReadParam < 0 && gpu->gpu.cudaCompCap < 80) {
|
|
int nvlink = 0;
|
|
// Since we don't know whether there are other communicators,
|
|
// it's better to keep things local if we have a single GPU.
|
|
if (system->nodes[GPU].count == 1) nvlink = 1;
|
|
for (int i=0; i<system->nodes[GPU].count; i++) {
|
|
if (i == g) continue;
|
|
if (gpu->paths[GPU][i].type == PATH_NVL) {
|
|
nvlink = 1;
|
|
break;
|
|
}
|
|
}
|
|
if (!nvlink) return ncclSuccess;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Check if we are close enough that it makes sense to enable GDR
|
|
int netGdrLevel = system->netGdrLevel == -2 ? PATH_PXB : system->netGdrLevel;
|
|
NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
|
|
if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
|
|
else {
|
|
int arch, vendor, model;
|
|
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
|
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) {
|
|
int i, d1 = -1, d2 = -1;
|
|
for (i = 0; i < system->nodes[CPU].count; i++)
|
|
if (system->nodes[GPU].nodes[g].paths[CPU][i].count == 2) break;
|
|
if (i <system->nodes[CPU].count) d1 = system->nodes[CPU].nodes[i].id;
|
|
for (i = 0; i < system->nodes[CPU].count; i++)
|
|
if (system->nodes[NET].nodes[n].paths[CPU][i].count == 2) break;
|
|
if (i <system->nodes[CPU].count) d2 = system->nodes[CPU].nodes[i].id;
|
|
if (d1 != -1 && d2 != -1 && d1 == d2 &&
|
|
(system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000)) {
|
|
netGdrLevel = PATH_PHB;
|
|
}
|
|
}
|
|
}
|
|
|
|
int distance = gpu->paths[NET][n].type;
|
|
if (distance == PATH_PXN) {
|
|
// In case of PXN, use the intermediate GPU distance instead
|
|
int proxyRank, g;
|
|
NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank));
|
|
NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
|
|
struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
|
|
distance = proxyGpu->paths[NET][n].type;
|
|
}
|
|
if (distance > netGdrLevel) {
|
|
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel);
|
|
return ncclSuccess;
|
|
}
|
|
|
|
*useGdr = 1;
|
|
INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read);
|
|
return ncclSuccess;
|
|
}
|
|
|
|
// Report whether GPU Direct RDMA can be used by `rank` with at least one NIC
// of `system`, in either direction.
//   avail : out, true as soon as one (NIC, direction) pair allows GDR
// Returns ncclSuccess or the error reported by ncclTopoCheckGdr.
ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) {
  const int netNum = system->nodes[NET].count;
  int useGdr = 0;
  *avail = false;
  for (int n = 0; n < netNum; n++) {
    const int64_t netId = system->nodes[NET].nodes[n].id;
    // Check the read direction (1) first, then the other one (0).
    for (int read = 1; read >= 0; read--) {
      NCCLCHECK(ncclTopoCheckGdr(system, rank, netId, read, &useGdr));
      if (useGdr) {
        *avail = true;
        return ncclSuccess;
      }
    }
  }
  return ncclSuccess;
}
|
|
|
|
// Set to 0 to disable the flush on Hopper when using GDR
|
|
NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);
|
|
|
|
// Determine whether we need to flush the GDR recv buffers
|
|
ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) {
|
|
*flush = 1;
|
|
ncclNetProperties_t props;
|
|
NCCLCHECK(comm->ncclNet->getProperties(netDev, &props));
|
|
if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess;
|
|
int g;
|
|
struct ncclTopoSystem* system = comm->topo;
|
|
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
|
|
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
|
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
|
*flush = 1;
|
|
#else
|
|
// Flush is required on Ampere and earlier
|
|
if (gpu->gpu.cudaCompCap >= 90) *flush = 0;
|
|
#endif
|
|
return ncclSuccess;
|
|
}
|
|
|
|
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);
|
|
|
|
// Check whether going through the network would be faster than going through P2P/SHM.
|
|
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net) {
|
|
if (ncclParamNetDisableIntra() == 1) {
|
|
*net = 0;
|
|
return ncclSuccess;
|
|
}
|
|
*net = 1;
|
|
// First check the current GPU-to-GPU speed.
|
|
int g1, g2;
|
|
if (ncclTopoRankToIndex(system, rank1, &g1) != ncclSuccess ||
|
|
ncclTopoRankToIndex(system, rank2, &g2) != ncclSuccess) {
|
|
return ncclSuccess;
|
|
}
|
|
|
|
struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
|
|
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
|
|
float speed = gpu1->paths[GPU][g2].bw;
|
|
|
|
// Now check the speed each GPU can access the network through PXB or better
|
|
float netSpeed1 = 0, netSpeed2 = 0;
|
|
for (int n=0; n<system->nodes[NET].count; n++) {
|
|
struct ncclTopoLinkList* path = gpu1->paths[NET]+n;
|
|
if (path->type <= PATH_PXB && path->bw > netSpeed1) netSpeed1 = path->bw;
|
|
path = gpu2->paths[NET]+n;
|
|
if (path->type <= PATH_PXB && path->bw > netSpeed2) netSpeed2 = path->bw;
|
|
}
|
|
|
|
if (netSpeed1 > speed && netSpeed2 > speed) return ncclSuccess;
|
|
*net = 0;
|
|
return ncclSuccess;
|
|
}
|
|
|
|
// Find the rank of the GPU relaying traffic between `rank` and NIC `netId`.
//   intermediateRank : out, rank of the relay GPU when the GPU->NIC path is
//                      of type PATH_PXN, `rank` itself otherwise
// Returns ncclInternalError (after a WARN) when the path is PXN but the first
// non-NVS node along it is not a GPU.
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank) {
  // Get GPU and NET
  int n, g;
  NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
  struct ncclTopoLinkList* path = gpu->paths[NET]+n;

  if (path->type != PATH_PXN) {
    *intermediateRank = rank;
    return ncclSuccess;
  }

  // Walk the path, skipping NVS nodes, and stop at the first other node.
  struct ncclTopoNode* node = NULL;
  int type = NVS;
  for (int i=0; i<path->count && type == NVS; i++) {
    node = path->list[i]->remNode;
    type = node->type;
  }
  if (type != GPU) {
    WARN("Could not find intermediate GPU between GPU rank %d and NIC %lx", rank, netId);
    return ncclInternalError;
  }
  *intermediateRank = node->gpu.rank;
  return ncclSuccess;
}
|
|
|
|
NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
|
|
|
|
// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
|
|
// remote proxies without risking deadlocks
|
|
int ncclPxnDisable(struct ncclComm* comm) {
|
|
static int pxnDisable = -1;
|
|
if (pxnDisable == -1) {
|
|
if (comm && ncclNetVersion(comm) == 4) {
|
|
INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
|
|
pxnDisable = 1;
|
|
} else {
|
|
pxnDisable = ncclParamPxnDisable();
|
|
}
|
|
}
|
|
return pxnDisable;
|
|
}
|
|
|
|
// Build the list of distinct proxy ranks that comm->rank uses to reach its peers.
// A peer's proxy rank is listed only when it differs from comm->rank and GDR
// (read direction) is usable between comm->rank and the selected NIC.
//   intermediateRanks : out, newly allocated array (NULL when empty); the
//                       caller owns it
//   nranks            : out, number of entries in *intermediateRanks
// On error both outputs keep their initial values (NULL / 0) and the
// partially built array is freed instead of being leaked.
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks) {
  ncclResult_t ret = ncclSuccess;
  struct ncclTopoSystem* system = comm->topo;
  *nranks = 0;
  *intermediateRanks = NULL;
  if (system->nodes[NET].count == 0) return ncclSuccess;

  int nr = 0;
  int* ranks = NULL;
  for (int rank=0; rank<comm->nRanks; rank++) {
    int64_t netId;
    int proxyRank;
    NCCLCHECKGOTO(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank), ret, fail);
    if (proxyRank == comm->rank) continue;
    int useGdr;
    NCCLCHECKGOTO(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr), ret, fail);
    if (useGdr == 0) continue;
    // Skip proxy ranks that are already listed.
    int found = 0;
    for (int r=0; r<nr; r++) {
      if (ranks[r] == proxyRank) {
        found = 1;
        break;
      }
    }
    if (!found) {
      NCCLCHECKGOTO(ncclRealloc(&ranks, nr, nr+1), ret, fail);
      ranks[nr++] = proxyRank;
    }
  }
  *nranks = nr;
  *intermediateRanks = ranks;
  return ncclSuccess;

fail:
  // Release whatever was collected before the failure.
  free(ranks);
  return ret;
}
|
|
|
|
static bool rcclPathOverride(struct ncclTopoSystem* system, uint64_t distance) {
|
|
int i, j;
|
|
|
|
for (i = 0; i < system->nodes[GPU].count; i++) {
|
|
for (j = 0; j < system->nodes[NET].count; j++) {
|
|
if ((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) || (system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
|
|
break;
|
|
}
|
|
if (j >= system->nodes[NET].count)
|
|
break;
|
|
}
|
|
if (i >= system->nodes[GPU].count) {
|
|
for (i = 0; i < system->nodes[GPU].count; i++) {
|
|
for (j = 0; j < system->nodes[NET].count; j++) {
|
|
if ((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) || (system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
|
|
system->nodes[GPU].nodes[i].paths[NET][j].type = PATH_PXB;
|
|
}
|
|
}
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Precompute paths between GPUs/NICs.
// Steps, in order:
//   1. drop any previously computed paths;
//   2. compute paths towards every CPU, GPU, NIC and NVSwitch;
//   3. reroute GPU<->GPU paths through a CPU where P2P cannot be used, and
//      (when `comm` is set) mark peers reachable by neither P2P nor SHM as
//      PATH_NET;
//   4. apply the gfx942/gfx950 GPU->NIC path override;
//   5. update GPU<->NIC paths for PXN and for the no-GDR case.
// `comm` may be NULL. Returns ncclSuccess or the first error met.
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) {
  // Remove everything in case we're re-computing
  ncclTopoRemovePaths(system);

  // Set direct paths to CPUs (needed in many cases), then GPUs, NICs and NVSwitches.
  const int baseTypes[] = { CPU, GPU, NET, NVS };
  const int nBaseTypes = sizeof(baseTypes)/sizeof(baseTypes[0]);
  for (int k=0; k<nBaseTypes; k++) {
    const int t = baseTypes[k];
    for (int i=0; i<system->nodes[t].count; i++) {
      NCCLCHECK(ncclTopoSetPaths(system->nodes[t].nodes+i, system));
    }
  }

  // Update path for GPUs when we don't want to / can't use GPU Direct P2P
  for (int g=0; g<system->nodes[GPU].count; g++) {
    for (int p=0; p<system->nodes[GPU].count; p++) {
      int p2p;
      NCCLCHECK(ncclTopoCheckP2p(comm, system, system->nodes[GPU].nodes[p].gpu.rank,
                                 system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
      if (p2p == 0) {
        // Divert all traffic through the CPU
        int cpu;
        NCCLCHECK(ncclGetLocalCpu(system, g, &cpu));
        NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
      }
    }

    if (comm == NULL) continue;
    // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
    struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank;
    for (int p=0; p<system->nodes[GPU].count; p++) {
      if (p == g) continue;
      struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
      int p2p;
      NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, comm, NULL, srcInfo, dstInfo));
      if (p2p != 0) continue;
      int shm;
      NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, comm, NULL, srcInfo, dstInfo));
      if (shm == 0) {
        // Mark this peer as inaccessible. We'll trim it later.
        system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
      }
    }
  }

  // Special handling of gfx94x and gfx950
  // Unless TOPO_EXPL is defined, this only applies when the BIOS version
  // string starts with "Hyper-V UEFI Release".
#if !defined(TOPO_EXPL)
  char strValue[1024];
  NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
  if (strncmp("Hyper-V UEFI Release", strValue, 20) == 0) {
#endif
    int arch, vendor, model;
    NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
    if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
        (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) &&
        ((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
         (system->nodes[GPU].count != system->nRanks))) {
      // Try the candidate GPU/NIC id distances in turn; stop at the first that applies.
      if (!rcclPathOverride(system, 0x100000) && !rcclPathOverride(system, 0x1000)) {
        rcclPathOverride(system, 0xff00000);
      }
    }
#if !defined(TOPO_EXPL)
  }
#endif

  // Update paths for NICs (no GPU Direct, PXN, ...)
  for (int n=0; n<system->nodes[NET].count; n++) {
    struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;

    for (int g=0; g<system->nodes[GPU].count; g++) {
      // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
      if (ncclPxnDisable(comm) != 1) {
        int localGpuIndex;
        NCCLCHECK(ncclTopoGetLocalGpu(system, netNode->id, &localGpuIndex));
        if (localGpuIndex != g && localGpuIndex != -1) {
          // PXN = PCI + NVLink.
          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
          // Only use PXN for NIC n if remote GPU p ...
          if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
              peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
              NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && // Is on the same node as us
              (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
               gpu->paths[NET][n].type > PATH_PXB)) {                // or avoids going through a CPU
            // We can use that GPU as relay to communicate with that NIC.
            // Only enabling it in the GPU->NIC direction for now to favor
            // receiving locally and sending remotely (consistent with net.cc)
            NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
          }
        }
      }
      if (gpu->paths[NET][n].type < PATH_PHB) {
        // Update path when we dont want to / can't use GPU Direct RDMA.
        int gdr;
        NCCLCHECK(ncclTopoCheckGdr(system, gpu->gpu.rank, netNode->id, 0, &gdr));
        if (gdr == 0) {
          // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
          int localCpu;
          NCCLCHECK(ncclGetLocalCpu(system, g, &localCpu));
          NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
          NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
        }
      }
    }
  }
  return ncclSuccess;
}
|
|
|
|
RCCL_PARAM(EnableIntranet, "ENABLE_INTRANET", -2);
|
|
|
|
// Trim the topology down to the parts relevant to this communicator.
//
// Steps, in order:
//  1. Assign each GPU a domain: GPU g starts in domain g and takes the smallest domain of any
//     earlier GPU p it reaches through a path whose type is below PATH_NET. Every GPU whose
//     domain differs from the domain of the GPU holding comm->rank is removed.
//  2. Among NET nodes sharing the same asic, the one with the lower bandwidth gets its bandwidth
//     set to 0; all NET nodes with zero bandwidth are then removed.
//  3. RCCL_TOPO_XGMI_ALL is set when ncclTopoGetLinkType reports XGMI for every ordered GPU pair.
//  4. RCCL_TOPO_GDR_ALL is set when ncclTopoCheckGdr reports GDR for every GPU and the system is
//     not all-XGMI; RCCL_TOPO_FORCE_INTRA is set when rcclParamEnableIntranet() == 1.
//  5. comm->localRanks is set to the remaining GPU count. If that count equals comm->nRanks and
//     neither condition of step 4 applied, every NET node is removed.
//
// system - topology to trim in place.
// comm   - communicator; comm->rank, comm->nRanks and comm->topo are read, comm->localRanks is written.
// Returns ncclSuccess, ncclInternalError if a recorded GPU id can no longer be found, or the
// first error returned by a helper.
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
  // All initialised function-scope locals are declared before the first NCCLCHECKGOTO so that
  // jumping to `fail` never crosses an initialisation.
  ncclResult_t ret = ncclSuccess;
  int* domainOf;
  int64_t* gpuIds = NULL;
  int localDomain = 0;
  const int initialGpuCount = system->nodes[GPU].count;
  int dropNets = 1;
  int gdr = 1;
  bool allXgmi = true;

  NCCLCHECK(ncclCalloc(&domainOf, system->nodes[GPU].count));
  NCCLCHECKGOTO(ncclCalloc(&gpuIds, system->nodes[GPU].count), ret, fail);

  // Step 1a: compute the domain of every GPU and remember its id.
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    struct ncclTopoNode* node = system->nodes[GPU].nodes + g;
    gpuIds[g] = node->id;
    domainOf[g] = g;
    for (int p = 0; p < g; p++) {
      if (node->paths[GPU][p].type < PATH_NET && domainOf[p] < domainOf[g]) {
        domainOf[g] = domainOf[p];
      }
    }
    if (node->gpu.rank == comm->rank) localDomain = domainOf[g];
  }

  // Step 1b: drop GPUs outside our domain. Indices shift as nodes are removed, so each victim
  // is looked up again by id in the current (shrinking) GPU list.
  for (int i = 0; i < initialGpuCount; i++) {
    if (domainOf[i] == localDomain) continue;
    int idx = 0;
    while (idx < system->nodes[GPU].count && system->nodes[GPU].nodes[idx].id != gpuIds[i]) idx++;
    if (idx == system->nodes[GPU].count) {
      WARN("Could not find id %lx", gpuIds[i]);
      ret = ncclInternalError;
      goto fail;
    }
    NCCLCHECKGOTO(ncclTopoRemoveNode(system, GPU, idx), ret, fail);
  }

  // Step 2a: on a shared asic, mark the slower port by zeroing its bandwidth.
  for (int i = 0; i < system->nodes[NET].count; i++) {
    for (int j = 0; j < system->nodes[NET].count; j++) {
      struct ncclTopoNode* a = system->nodes[NET].nodes + i;
      struct ncclTopoNode* b = system->nodes[NET].nodes + j;
      if (i != j && a->net.asic == b->net.asic && a->net.bw > b->net.bw) {
        b->net.bw = 0;
      }
    }
  }

  // Step 2b: remove marked NET nodes one at a time, rescanning from the start after each removal.
  for (;;) {
    int n = 0;
    while (n < system->nodes[NET].count && system->nodes[NET].nodes[n].net.bw != 0) n++;
    if (n >= system->nodes[NET].count) break;
    NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail);
  }

  // Step 3: the system is "all XGMI" only if every ordered pair of distinct GPUs is XGMI-linked.
  for (int i = 0; allXgmi && i < system->nodes[GPU].count; i++) {
    int devA = system->nodes[GPU].nodes[i].gpu.dev;
    for (int j = 0; allXgmi && j < system->nodes[GPU].count; j++) {
      if (i == j) continue;
      int devB = system->nodes[GPU].nodes[j].gpu.dev;
      bool isXGMI;
      NCCLCHECKGOTO(ncclTopoGetLinkType(comm->topo, devA, devB, &isXGMI), ret, fail);
      allXgmi = allXgmi && isXGMI;
    }
  }
  if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;

  // Step 4: check GDR against each GPU's local NET; stop at the first GPU without it.
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    int64_t netId;
    NCCLCHECKGOTO(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &netId, nullptr), ret, fail);
    NCCLCHECKGOTO(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netId, 1, &gdr), ret, fail);
    if (!gdr) break;
  }
  if (gdr && !allXgmi) {
    dropNets = 0;
    system->type |= RCCL_TOPO_GDR_ALL;
    INFO(NCCL_GRAPH, "GDR is available on all GPUs");
  }

  if (rcclParamEnableIntranet() == 1) {
    dropNets = 0;
    system->type |= RCCL_TOPO_FORCE_INTRA;
  }

  // Step 5: record local rank count and, when all ranks are local, drop the NET nodes.
  comm->localRanks = system->nodes[GPU].count;
  if (dropNets && system->nodes[GPU].count == comm->nRanks) {
    for (int n = system->nodes[NET].count - 1; n >= 0; n--) {
      NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail);
    }
  }

exit:
  free(domainOf);
  free(gpuIds);
  return ret;
fail:
  goto exit;
}
|
|
|
|
// Release a topology system.
//
// Calls ncclTopoRemovePaths() on the system and then frees the system object itself.
// A NULL system is accepted and ignored (mirroring free()), so teardown paths may call this
// unconditionally even when the topology was never created.
void ncclTopoFree(struct ncclTopoSystem* system) {
  if (system == NULL) return;
  ncclTopoRemovePaths(system);
  free(system);
}
|
|
|
|
// Channel count to use towards a peer reached over the network. ncclTopoGetNchannels treats -1
// as "choose automatically"; any other value is used as-is.
NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", -1);
|
|
// P2P channels per peer. ncclTopoComputeP2pChannels treats -2 as "derive from the topology"
// (pow2Up of the smallest per-peer channel count); any other value is taken as the starting
// per-peer count in its generic branch.
NCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", -2);
|
|
|
|
// Compute how many P2P channels local GPU `g` should use towards `peerRank`.
//
// comm      - communicator; its topology, nRanks and p2pnChannels are read.
// g         - index of the local GPU in the topology's GPU list.
// peerRank  - rank of the peer.
// nChannels - out: -1 when the peer is GPU `g` itself; otherwise the channel count.
//
// Peers found in the local topology get 2 channels, unless the path is PATH_NVL, in which case
// the count scales with path bandwidth (base 4 on gfx942/gfx950, base 2 otherwise). Peers not
// found locally are treated as network peers.
// Returns ncclSuccess, or the error from getLocalNetCountByBw.
static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gpu index*/, int peerRank, int* nChannels) {
  struct ncclTopoSystem* system = comm->topo;
  int peer;

  if (ncclTopoRankToIndex(system, peerRank, &peer) != ncclSuccess) {
    // Remote rank: traffic goes through the network.
    int nNetChannels = ncclParamNChannelsPerNetPeer();
    if (nNetChannels == -1) {
      // Automatic mode: start from 2 channels per NIC and reduce with scale.
      int nChannelsMax = 2;
      // More than one NIC may be needed, hence more than one channel.
      int netCountByBw = 1;
      NCCLCHECK(getLocalNetCountByBw(system, g, &netCountByBw));
      // Avoid overloading channels with 8+ operations: we would lose the sync warp, and with it
      // a bit of bandwidth.
      while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) {
        nChannelsMax /= 2;
      }
      // Allow up to as many channels as are required to drive the NICs.
      nNetChannels = std::max(netCountByBw, nChannelsMax);
    }
    *nChannels = nNetChannels;
    return ncclSuccess;
  }

  // Same rank: nothing to connect.
  if (peer == g) {
    *nChannels = -1;
    return ncclSuccess;
  }

  // Local rank: decide from the GPU-to-GPU path.
  struct ncclTopoLinkList* path = system->nodes[GPU].nodes[peer].paths[GPU] + g;
  if (path->type != PATH_NVL) {
    *nChannels = 2;
    return ncclSuccess;
  }

  float nvlBw = ncclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
  bool wideArch = IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
                  IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950");
  int base = wideArch ? 4 : 2;
  *nChannels = base * std::max(1, (int)(path->bw / nvlBw));
  return ncclSuccess;
}
|
|
|
|
// Lower bound applied to comm->p2pnChannels at the start of ncclTopoComputeP2pChannels.
NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1);
|
|
// Upper bound applied to comm->p2pnChannels at the start of ncclTopoComputeP2pChannels.
NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
|
|
// Declared here, defined in another translation unit; ncclTopoComputeP2pChannels passes its
// value to ncclDevMaxChannelsForArgsBytes() to bound the channel count.
extern int64_t ncclParamWorkArgsBytes();
|
|
|
|
// Decide comm->p2pnChannels and comm->p2pnChannelsPerPeer, then initialise any channel with an
// index in [comm->nChannels, comm->p2pnChannels) that was not in use so far.
//
// comm - communicator; nChannels, nRanks, sharedRes and topo are read, p2pnChannels and
//        p2pnChannelsPerPeer are written.
// Returns ncclSuccess, or the first error from ncclTopoGetNchannels, ncclTopoCpuType or
// initChannel.
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
  /* comm->max/minCTAs are already honored for p2pnChannels at this point. */
  // Clamp nChannels into [MinP2pNChannels, MaxP2pNChannels] (min wins if they conflict); a comm
  // that does not own its shared resources is further capped by the owner's tpP2pNChannels.
  int clamped = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
  clamped = std::max(clamped, (int)ncclParamMinP2pNChannels());
  if (comm->sharedRes->owner != comm) {
    clamped = std::min(clamped, comm->sharedRes->tpP2pNChannels);
  }
  comm->p2pnChannels = clamped;

  // Loop over every local GPU and every rank to get a global picture of the smallest
  // per-peer channel count. Negative results (self) are ignored.
  int minChannels = comm->p2pnChannels;
  for (int g = 0; g < comm->topo->nodes[GPU].count; g++) {
    for (int r = 0; r < comm->nRanks; r++) {
      int nChannels;
      NCCLCHECK(ncclTopoGetNchannels(comm, g, r, &nChannels));
      if (nChannels >= 0 && nChannels < minChannels) minChannels = nChannels;
    }
  }

  int arch, vendor, model;
  NCCLCHECK(ncclTopoCpuType(comm->topo, &arch, &vendor, &model));

  const bool intelX86 = (arch == NCCL_TOPO_CPU_ARCH_X86) && (vendor == NCCL_TOPO_CPU_VENDOR_INTEL);
  const bool xgmiAll = (comm->topo->type & RCCL_TOPO_XGMI_ALL) != 0;
  const bool gdrAll = (comm->topo->type & RCCL_TOPO_GDR_ALL) != 0;
  const bool rome4p2h = (comm->topo->type & RCCL_TOPO_4P2H_ROME) != 0;
  const bool singleNode = (comm->topo->nodes[GPU].count == comm->topo->nRanks);

  if (intelX86 && !xgmiAll) {
    // Adjust P2P channels on Intel platform
    comm->p2pnChannelsPerPeer = 8;
    comm->p2pnChannels = 8;
  } else if (singleNode && rome4p2h && !gdrAll && !xgmiAll) {
    // Adjust P2P channels on Rome
    comm->p2pnChannelsPerPeer = 2;
    comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), pow2Down(ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())));
  } else {
    // Round nChannelsPerPeer up to the next power of two unless the user fixed it.
    if (ncclParamNChannelsPerPeer() == -2) {
      comm->p2pnChannelsPerPeer = pow2Up(minChannels);
    } else {
      comm->p2pnChannelsPerPeer = ncclParamNChannelsPerPeer();
    }
    // Doubling P2P channels per peer on single node (gfx942 / gfx950 only)
    if (singleNode &&
        (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
         IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))) {
      comm->p2pnChannelsPerPeer *= 2;
    }
    // Round nChannels up to the next power of two, capped at 4*CHANNEL_LIMIT.
    comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), 4*CHANNEL_LIMIT);
    // p2pnChannelsPerPeer cannot be greater than MAXCHANNELS
    comm->p2pnChannelsPerPeer = std::min(comm->p2pnChannelsPerPeer, MAXCHANNELS);
  }

  // Init channels that weren't used so far
  for (int c = comm->nChannels; c < std::max(comm->nChannels, comm->p2pnChannels); c++) {
    NCCLCHECK(initChannel(comm, c));
  }

  return ncclSuccess;
}
|
|
|
|
// List the ranks of the GPUs that the GPU holding `rank` reaches through a PATH_NVB path.
//
// system - topology to inspect.
// rank   - rank whose GPU node is used as the starting point.
// nranks - out: number of entries written to *ranks.
// ranks  - out: array allocated here with ncclCalloc, sized for every GPU in the system; this
//          function does not free it.
// Returns ncclSuccess, or the error from ncclCalloc (in which case *nranks is not written).
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) {
  const int gpuCount = system->nodes[GPU].count;
  NCCLCHECK(ncclCalloc(ranks, gpuCount));

  int found = 0;
  for (int g = 0; g < gpuCount; g++) {
    struct ncclTopoNode* self = system->nodes[GPU].nodes + g;
    if (self->gpu.rank == rank) {
      for (int p = 0; p < gpuCount; p++) {
        if (self->paths[GPU][p].type != PATH_NVB) continue;
        (*ranks)[found] = system->nodes[GPU].nodes[p].gpu.rank;
        found++;
      }
    }
  }

  *nranks = found;
  return ncclSuccess;
}
|
|
|
|
// Find the smallest path type from any GPU to any node of the given type.
//
// system - topology to inspect.
// type   - node type index of the path destinations.
// min    - out: the smallest path type found; PATH_SYS if nothing smaller exists.
//
// GPUs with no path table for `type` are skipped; when `type` is GPU, a GPU's path to itself
// is ignored. Always returns ncclSuccess.
ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min) {
  int best = PATH_SYS;
  const int gpuCount = system->nodes[GPU].count;
  for (int i = 0; i < gpuCount; i++) {
    struct ncclTopoLinkList* row = system->nodes[GPU].nodes[i].paths[type];
    if (row != NULL) {
      for (int j = 0; j < system->nodes[type].count; j++) {
        const bool isSelf = (type == GPU && i == j);
        if (!isSelf && row[j].type < best) best = row[j].type;
      }
    }
  }
  *min = best;
  return ncclSuccess;
}
|
|
|
|
// Find the largest path type from any GPU to any node of the given type.
//
// system - topology to inspect.
// type   - node type index of the path destinations.
// max    - out: the largest path type found; PATH_LOC if nothing larger exists.
//
// GPUs with no path table for `type` are skipped; when `type` is GPU, a GPU's path to itself
// is ignored. Always returns ncclSuccess.
ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max) {
  int worst = PATH_LOC;
  const int gpuCount = system->nodes[GPU].count;
  for (int i = 0; i < gpuCount; i++) {
    struct ncclTopoLinkList* row = system->nodes[GPU].nodes[i].paths[type];
    if (row != NULL) {
      for (int j = 0; j < system->nodes[type].count; j++) {
        const bool isSelf = (type == GPU && i == j);
        if (!isSelf && row[j].type > worst) worst = row[j].type;
      }
    }
  }
  *max = worst;
  return ncclSuccess;
}
|
|
|
|
// Report whether every GPU-to-GPU path is of a type below PATH_PIX.
//
// system    - topology to inspect.
// allNvLink - out: 1 when the largest GPU-to-GPU path type is below PATH_PIX, 0 otherwise.
// Returns ncclSuccess, or the error from ncclTopoGetGpuMaxPath.
ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink) {
  int worstGpuPath;
  NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &worstGpuPath));
  *allNvLink = (worstGpuPath < PATH_PIX) ? 1 : 0;
  return ncclSuccess;
}
|