Use nontemporal in slow path and add XGMI sys type (#575)

* Use nontemporal in slow path and add XGMI sys type

* Clean up XGMI detection
This commit is contained in:
Wenkai Du
2022-07-06 07:58:41 -07:00
committed by GitHub
parent 00af1f64e9
commit b250c01cbe
4 changed files with 66 additions and 68 deletions
+6 -6
View File
@@ -425,12 +425,12 @@ struct MULTI<FUNC, int64_t> {
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
return *ptr;
return __builtin_nontemporal_load(ptr);
}
template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
*ptr = val;
__builtin_nontemporal_store(val, ptr);
}
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
@@ -449,25 +449,25 @@ void vStore<half>(volatile half* ptr, const half val) {
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
r = ((half*)ptr)[0];
r = __builtin_nontemporal_load((uint16_t*)ptr);
return r;
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
((half*)ptr)[0] = val;
__builtin_nontemporal_store(val, (uint16_t*)ptr);
}
template<> inline __device__
rccl_bfloat16 vFetch<rccl_bfloat16>(const volatile rccl_bfloat16* ptr) {
rccl_bfloat16 r;
r.data = ptr->data;
r.data = __builtin_nontemporal_load(&ptr->data);
return r;
}
template<> inline __device__
void vStore<rccl_bfloat16>(volatile rccl_bfloat16* ptr, const rccl_bfloat16 val) {
ptr->data = val.data;
__builtin_nontemporal_store(val.data, &ptr->data);
}
#endif
+23 -27
View File
@@ -653,34 +653,30 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD
&& model == NCCL_TOPO_CPU_TYPE_ROME) {
int gdr, ret = 1;
int net;
for (int g = 0; g < system->nodes[GPU].count; g++) {
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
if (!gdr) {
ret = 0;
break;
int gdr = 1;
bool allXgmi = true;
// detect if all GPUs are connected by XGMI
for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
if (i == j) continue;
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
bool isXGMI;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
allXgmi &= isXGMI;
}
}
if (ret) {
bool allXgmi = true;
// don't trim NICs unless all GPUs are connected by XGMI
for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
if (i == j) continue;
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
bool isXGMI;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
allXgmi &= isXGMI;
}
}
if (!allXgmi) {
remove = 0;
system->type |= RCCL_TOPO_GDR_ALL;
INFO(NCCL_GRAPH, "GDR is available on all GPUs");
}
if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
for (int g = 0; g < system->nodes[GPU].count; g++) {
int net;
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
if (!gdr) break;
}
if (gdr && !allXgmi) {
remove = 0;
system->type |= RCCL_TOPO_GDR_ALL;
INFO(NCCL_GRAPH, "GDR is available on all GPUs");
}
}
@@ -755,7 +751,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
}
}
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && (comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)) {
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) {
// Adjust P2P channels on Rome
comm->p2pnChannelsPerPeer = 2;
comm->p2pnChannels = 2;
+36 -35
View File
@@ -1171,7 +1171,6 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int *inter) {
int interGpus[MAX_XGMI_INTER_GPUS+1];
int ngpus = system->nodes[GPU].count;
*isXGMI = false;
// check for direct XGMI connection
for (int i=0; i<ngpus; i++) {
if (system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
@@ -1188,42 +1187,44 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in
}
}
}
if (maxInter == 0) return ncclSuccess;
// check if there are intermediate GPUs that are connected to both
bool res1, res2, res3;
int j;
for (j=0; j<nInter; j++) {
bool res1;
ncclTopoGetLinkType(system, inter[j], inter[j+1], &res1, 0);
if (!res1) break;
}
if (j<nInter) return ncclSuccess;
if (nInter > 0 && inter != nullptr) {
ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
if (res2) {
*isXGMI = true;
return ncclSuccess;
// try intermediate GPUs
if (maxInter) {
// check if there are intermediate GPUs that are connected to both
bool res1, res2, res3;
int j;
for (j=0; j<nInter; j++) {
ncclTopoGetLinkType(system, inter[j], inter[j+1], &res1, 0);
if (!res1) break;
}
memcpy(interGpus+1, inter+1, sizeof(int)*nInter);
}
interGpus[0] = cudaDev1;
// add one more intermediate GPU recursively util reaching max depth
nInter++;
if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess;
for (int i=0; i<ngpus; i++) {
int dev = system->nodes[GPU].nodes[i].gpu.dev;
// skip duplicated GPU
if (dev == cudaDev2) continue;
for (j=0; j<nInter; j++)
if (dev == interGpus[j]) break;
if (j<nInter) continue;
// check connectivity with intermediate GPUs
interGpus[nInter] = dev;
ncclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
if (res3) {
*isXGMI = true;
return ncclSuccess;
if (j<nInter) return ncclSuccess;
if (nInter > 0 && inter != nullptr) {
ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
if (res2) {
*isXGMI = true;
return ncclSuccess;
}
memcpy(interGpus+1, inter+1, sizeof(int)*nInter);
}
interGpus[0] = cudaDev1;
// add one more intermediate GPU recursively util reaching max depth
nInter++;
if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess;
for (int i=0; i<ngpus; i++) {
int dev = system->nodes[GPU].nodes[i].gpu.dev;
// skip duplicated GPU
if (dev == cudaDev2) continue;
for (j=0; j<nInter; j++)
if (dev == interGpus[j]) break;
if (j<nInter) continue;
// check connectivity with intermediate GPUs
interGpus[nInter] = dev;
ncclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
if (res3) {
*isXGMI = true;
return ncclSuccess;
}
}
}
*isXGMI = false;
return ncclSuccess;
}
+1
View File
@@ -104,6 +104,7 @@ struct ncclTopoLinkList {
#define RCCL_TOPO_GDR_ALL 4
#define RCCL_TOPO_16P1H 8
#define RCCL_TOPO_FORCE_INTRA 16
#define RCCL_TOPO_XGMI_ALL 32
#define RCCL_TOPO_MAX_RANKS_PER_GPU 8
struct ncclTopoNode {