From b250c01cbe06af0935461fecfa052c9a7d28c9ef Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Wed, 6 Jul 2022 07:58:41 -0700 Subject: [PATCH] Use nontemporal in slow path and add XGMI sys type (#575) * Use nontemporal in slow path and add XGMI sys type * Clean up XGMI detection --- src/collectives/device/common_kernel.h | 12 ++--- src/graph/paths.cc | 50 +++++++++--------- src/graph/search.cc | 71 +++++++++++++------------- src/graph/topo.h | 1 + 4 files changed, 66 insertions(+), 68 deletions(-) diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index d6fa08f186..e117f70561 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -425,12 +425,12 @@ struct MULTI { template inline __device__ T vFetch(const volatile T* ptr) { - return *ptr; + return __builtin_nontemporal_load(ptr); } template inline __device__ void vStore(volatile T* ptr, const T val) { - *ptr = val; + __builtin_nontemporal_store(val, ptr); } #if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)) @@ -449,25 +449,25 @@ void vStore(volatile half* ptr, const half val) { template<> inline __device__ half vFetch(const volatile half* ptr) { half r; - r = ((half*)ptr)[0]; + r = __builtin_nontemporal_load((uint16_t*)ptr); return r; } template<> inline __device__ void vStore(volatile half* ptr, const half val) { - ((half*)ptr)[0] = val; + __builtin_nontemporal_store(val, (uint16_t*)ptr); } template<> inline __device__ rccl_bfloat16 vFetch(const volatile rccl_bfloat16* ptr) { rccl_bfloat16 r; - r.data = ptr->data; + r.data = __builtin_nontemporal_load(&ptr->data); return r; } template<> inline __device__ void vStore(volatile rccl_bfloat16* ptr, const rccl_bfloat16 val) { - ptr->data = val.data; + __builtin_nontemporal_store(val.data, &ptr->data); } #endif diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 762c6a639c..64fbba59a8 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -653,34 +653,30 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) { - int gdr, ret = 1; - int net; - for (int g = 0; g < system->nodes[GPU].count; g++) { - NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net)); - NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr)); - if (!gdr) { - ret = 0; - break; + int gdr = 1; + bool allXgmi = true; + // detect if all GPUs are connected by XGMI + for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) { + int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev; + for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) { + if (i == j) continue; + int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev; + bool isXGMI; + NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI)); + allXgmi &= isXGMI; } } - if (ret) { - bool allXgmi = true; - // don't trim NICs unless all GPUs are connected by XGMI - for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) { - int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev; - for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) { - if (i == j) continue; - int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev; - bool isXGMI; - NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI)); - allXgmi &= isXGMI; - } - } - if (!allXgmi) { - remove = 0; - system->type |= RCCL_TOPO_GDR_ALL; - INFO(NCCL_GRAPH, "GDR is available on all GPUs"); - } + if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL; + for (int g = 0; g < system->nodes[GPU].count; g++) { + int net; + NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net)); + NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr)); + if (!gdr) break; + } + if (gdr && !allXgmi) { + remove = 0; + system->type |= RCCL_TOPO_GDR_ALL; + INFO(NCCL_GRAPH, "GDR is available on all GPUs"); } } @@ -755,7 +751,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { } } - if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && (comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)) { + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) { // Adjust P2P channels on Rome comm->p2pnChannelsPerPeer = 2; comm->p2pnChannels = 2; diff --git a/src/graph/search.cc b/src/graph/search.cc index edbafc0dc1..4b5e5ff66f 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -1171,7 +1171,6 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int *inter) { int interGpus[MAX_XGMI_INTER_GPUS+1]; int ngpus = system->nodes[GPU].count; - *isXGMI = false; // check for direct XGMI connection for (int i=0; inodes[GPU].nodes[i].gpu.dev == cudaDev1) { @@ -1188,42 +1187,44 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in } } } - if (maxInter == 0) return ncclSuccess; - // check if there are intermediate GPUs that are connected to both - bool res1, res2, res3; - int j; - for (j=0; j 0 && inter != nullptr) { - ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0); - if (res2) { - *isXGMI = true; - return ncclSuccess; + // try intermediate GPUs + if (maxInter) { + // check if there are intermediate GPUs that are connected to both + bool res1, res2, res3; + int j; + for (j=0; j ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess; - for (int i=0; inodes[GPU].nodes[i].gpu.dev; - // skip duplicated GPU - if (dev == cudaDev2) continue; - for (j=0; j 0 && inter != nullptr) { + ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0); + if (res2) { + *isXGMI = true; + return ncclSuccess; + } + memcpy(interGpus+1, inter+1, sizeof(int)*nInter); + } + interGpus[0] = cudaDev1; + // add one more intermediate GPU recursively util reaching max depth + nInter++; + if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess; + for (int i=0; inodes[GPU].nodes[i].gpu.dev; + // skip duplicated GPU + if (dev == cudaDev2) continue; + for (j=0; j