From f5c0b243a8bdf249f4e9831ef8cd5cd61f288601 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Fri, 12 Aug 2022 09:52:29 -0700 Subject: [PATCH] Add XGMI sys type and clean up detection code (#597) --- src/graph/paths.cc | 50 +++++++++++++++---------------- src/graph/search.cc | 71 +++++++++++++++++++++++---------------------- src/graph/topo.h | 1 + 3 files changed, 60 insertions(+), 62 deletions(-) diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 762c6a639c..64fbba59a8 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -653,34 +653,30 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) { - int gdr, ret = 1; - int net; - for (int g = 0; g < system->nodes[GPU].count; g++) { - NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net)); - NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr)); - if (!gdr) { - ret = 0; - break; + int gdr = 1; + bool allXgmi = true; + // detect if all GPUs are connected by XGMI + for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) { + int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev; + for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) { + if (i == j) continue; + int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev; + bool isXGMI; + NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI)); + allXgmi &= isXGMI; } } - if (ret) { - bool allXgmi = true; - // don't trim NICs unless all GPUs are connected by XGMI - for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) { - int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev; - for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) { - if (i == j) continue; - int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev; - bool isXGMI; - NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI)); - allXgmi &= isXGMI; - } - } - if (!allXgmi) { - remove = 0; - system->type |= RCCL_TOPO_GDR_ALL; - INFO(NCCL_GRAPH, "GDR is available on all GPUs"); - } + if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL; + for (int g = 0; g < system->nodes[GPU].count; g++) { + int net; + NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net)); + NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr)); + if (!gdr) break; + } + if (gdr && !allXgmi) { + remove = 0; + system->type |= RCCL_TOPO_GDR_ALL; + INFO(NCCL_GRAPH, "GDR is available on all GPUs"); } } @@ -755,7 +751,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { } } - if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && (comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)) { + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) { // Adjust P2P channels on Rome comm->p2pnChannelsPerPeer = 2; comm->p2pnChannels = 2; diff --git a/src/graph/search.cc b/src/graph/search.cc index edbafc0dc1..4b5e5ff66f 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -1171,7 +1171,6 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int *inter) { int interGpus[MAX_XGMI_INTER_GPUS+1]; int ngpus = system->nodes[GPU].count; - *isXGMI = false; // check for direct XGMI connection for (int i=0; inodes[GPU].nodes[i].gpu.dev == cudaDev1) { @@ -1188,42 +1187,44 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in } } } - if (maxInter == 0) return ncclSuccess; - // check if there are intermediate GPUs that are connected to both - bool res1, res2, res3; - int j; - for (j=0; j 0 && inter != nullptr) { - ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0); - if (res2) { - *isXGMI = true; - return ncclSuccess; + // try intermediate GPUs + if (maxInter) { + // check if there are intermediate GPUs that are connected to both + bool res1, res2, res3; + int j; + for (j=0; j ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess; - for (int i=0; inodes[GPU].nodes[i].gpu.dev; - // skip duplicated GPU - if (dev == cudaDev2) continue; - for (j=0; j 0 && inter != nullptr) { + ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0); + if (res2) { + *isXGMI = true; + return ncclSuccess; + } + memcpy(interGpus+1, inter+1, sizeof(int)*nInter); + } + interGpus[0] = cudaDev1; + // add one more intermediate GPU recursively util reaching max depth + nInter++; + if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess; + for (int i=0; inodes[GPU].nodes[i].gpu.dev; + // skip duplicated GPU + if (dev == cudaDev2) continue; + for (j=0; j