Use nontemporal in slow path and add XGMI sys type (#575)
* Use nontemporal in slow path and add XGMI sys type * Clean up XGMI detection
This commit is contained in:
@@ -425,12 +425,12 @@ struct MULTI<FUNC, int64_t> {
|
||||
|
||||
template<typename T> inline __device__
|
||||
T vFetch(const volatile T* ptr) {
|
||||
return *ptr;
|
||||
return __builtin_nontemporal_load(ptr);
|
||||
}
|
||||
|
||||
template<typename T> inline __device__
|
||||
void vStore(volatile T* ptr, const T val) {
|
||||
*ptr = val;
|
||||
__builtin_nontemporal_store(val, ptr);
|
||||
}
|
||||
|
||||
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
|
||||
@@ -449,25 +449,25 @@ void vStore<half>(volatile half* ptr, const half val) {
|
||||
template<> inline __device__
|
||||
half vFetch<half>(const volatile half* ptr) {
|
||||
half r;
|
||||
r = ((half*)ptr)[0];
|
||||
r = __builtin_nontemporal_load((uint16_t*)ptr);
|
||||
return r;
|
||||
}
|
||||
|
||||
template<> inline __device__
|
||||
void vStore<half>(volatile half* ptr, const half val) {
|
||||
((half*)ptr)[0] = val;
|
||||
__builtin_nontemporal_store(val, (uint16_t*)ptr);
|
||||
}
|
||||
|
||||
template<> inline __device__
|
||||
rccl_bfloat16 vFetch<rccl_bfloat16>(const volatile rccl_bfloat16* ptr) {
|
||||
rccl_bfloat16 r;
|
||||
r.data = ptr->data;
|
||||
r.data = __builtin_nontemporal_load(&ptr->data);
|
||||
return r;
|
||||
}
|
||||
|
||||
template<> inline __device__
|
||||
void vStore<rccl_bfloat16>(volatile rccl_bfloat16* ptr, const rccl_bfloat16 val) {
|
||||
ptr->data = val.data;
|
||||
__builtin_nontemporal_store(val.data, &ptr->data);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
+23
-27
@@ -653,34 +653,30 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD
|
||||
&& model == NCCL_TOPO_CPU_TYPE_ROME) {
|
||||
int gdr, ret = 1;
|
||||
int net;
|
||||
for (int g = 0; g < system->nodes[GPU].count; g++) {
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
|
||||
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
|
||||
if (!gdr) {
|
||||
ret = 0;
|
||||
break;
|
||||
int gdr = 1;
|
||||
bool allXgmi = true;
|
||||
// detect if all GPUs are connected by XGMI
|
||||
for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
|
||||
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
|
||||
for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
|
||||
if (i == j) continue;
|
||||
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
|
||||
bool isXGMI;
|
||||
NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
|
||||
allXgmi &= isXGMI;
|
||||
}
|
||||
}
|
||||
if (ret) {
|
||||
bool allXgmi = true;
|
||||
// don't trim NICs unless all GPUs are connected by XGMI
|
||||
for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
|
||||
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
|
||||
for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
|
||||
if (i == j) continue;
|
||||
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
|
||||
bool isXGMI;
|
||||
NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
|
||||
allXgmi &= isXGMI;
|
||||
}
|
||||
}
|
||||
if (!allXgmi) {
|
||||
remove = 0;
|
||||
system->type |= RCCL_TOPO_GDR_ALL;
|
||||
INFO(NCCL_GRAPH, "GDR is available on all GPUs");
|
||||
}
|
||||
if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
|
||||
for (int g = 0; g < system->nodes[GPU].count; g++) {
|
||||
int net;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
|
||||
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
|
||||
if (!gdr) break;
|
||||
}
|
||||
if (gdr && !allXgmi) {
|
||||
remove = 0;
|
||||
system->type |= RCCL_TOPO_GDR_ALL;
|
||||
INFO(NCCL_GRAPH, "GDR is available on all GPUs");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -755,7 +751,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
}
|
||||
}
|
||||
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && (comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)) {
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) {
|
||||
// Adjust P2P channels on Rome
|
||||
comm->p2pnChannelsPerPeer = 2;
|
||||
comm->p2pnChannels = 2;
|
||||
|
||||
+36
-35
@@ -1171,7 +1171,6 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str
|
||||
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int *inter) {
|
||||
int interGpus[MAX_XGMI_INTER_GPUS+1];
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
*isXGMI = false;
|
||||
// check for direct XGMI connection
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
if (system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
|
||||
@@ -1188,42 +1187,44 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in
|
||||
}
|
||||
}
|
||||
}
|
||||
if (maxInter == 0) return ncclSuccess;
|
||||
// check if there are intermediate GPUs that are connected to both
|
||||
bool res1, res2, res3;
|
||||
int j;
|
||||
for (j=0; j<nInter; j++) {
|
||||
bool res1;
|
||||
ncclTopoGetLinkType(system, inter[j], inter[j+1], &res1, 0);
|
||||
if (!res1) break;
|
||||
}
|
||||
if (j<nInter) return ncclSuccess;
|
||||
if (nInter > 0 && inter != nullptr) {
|
||||
ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
|
||||
if (res2) {
|
||||
*isXGMI = true;
|
||||
return ncclSuccess;
|
||||
// try intermediate GPUs
|
||||
if (maxInter) {
|
||||
// check if there are intermediate GPUs that are connected to both
|
||||
bool res1, res2, res3;
|
||||
int j;
|
||||
for (j=0; j<nInter; j++) {
|
||||
ncclTopoGetLinkType(system, inter[j], inter[j+1], &res1, 0);
|
||||
if (!res1) break;
|
||||
}
|
||||
memcpy(interGpus+1, inter+1, sizeof(int)*nInter);
|
||||
}
|
||||
interGpus[0] = cudaDev1;
|
||||
// add one more intermediate GPU recursively util reaching max depth
|
||||
nInter++;
|
||||
if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess;
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
int dev = system->nodes[GPU].nodes[i].gpu.dev;
|
||||
// skip duplicated GPU
|
||||
if (dev == cudaDev2) continue;
|
||||
for (j=0; j<nInter; j++)
|
||||
if (dev == interGpus[j]) break;
|
||||
if (j<nInter) continue;
|
||||
// check connectivity with intermediate GPUs
|
||||
interGpus[nInter] = dev;
|
||||
ncclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
|
||||
if (res3) {
|
||||
*isXGMI = true;
|
||||
return ncclSuccess;
|
||||
if (j<nInter) return ncclSuccess;
|
||||
if (nInter > 0 && inter != nullptr) {
|
||||
ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
|
||||
if (res2) {
|
||||
*isXGMI = true;
|
||||
return ncclSuccess;
|
||||
}
|
||||
memcpy(interGpus+1, inter+1, sizeof(int)*nInter);
|
||||
}
|
||||
interGpus[0] = cudaDev1;
|
||||
// add one more intermediate GPU recursively util reaching max depth
|
||||
nInter++;
|
||||
if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess;
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
int dev = system->nodes[GPU].nodes[i].gpu.dev;
|
||||
// skip duplicated GPU
|
||||
if (dev == cudaDev2) continue;
|
||||
for (j=0; j<nInter; j++)
|
||||
if (dev == interGpus[j]) break;
|
||||
if (j<nInter) continue;
|
||||
// check connectivity with intermediate GPUs
|
||||
interGpus[nInter] = dev;
|
||||
ncclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
|
||||
if (res3) {
|
||||
*isXGMI = true;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
*isXGMI = false;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -104,6 +104,7 @@ struct ncclTopoLinkList {
|
||||
#define RCCL_TOPO_GDR_ALL 4
|
||||
#define RCCL_TOPO_16P1H 8
|
||||
#define RCCL_TOPO_FORCE_INTRA 16
|
||||
#define RCCL_TOPO_XGMI_ALL 32
|
||||
|
||||
#define RCCL_TOPO_MAX_RANKS_PER_GPU 8
|
||||
struct ncclTopoNode {
|
||||
|
||||
Reference in New Issue
Block a user