|
|
|
@@ -54,7 +54,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
|
|
|
|
|
|
|
|
|
|
// Latencies in us, Bandwidths in GB/s
|
|
|
|
|
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
|
|
|
|
|
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 39.0, 39.0, 49.0 }, { 21.0, 21.0, 30.0 }, { 37.9, 37.9, 40.4 } };
|
|
|
|
|
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 } };
|
|
|
|
|
|
|
|
|
|
// NVLink, PCI, Network
|
|
|
|
|
#define NCCL_HW_NVLINK 0
|
|
|
|
@@ -63,11 +63,11 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 39.0
|
|
|
|
|
// Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
|
|
|
|
|
static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
|
|
|
|
|
{ /* NVLINK */
|
|
|
|
|
{ /* Tree (LL/LL128/Simple)*/ { 2.5, 2.5, 5.5 }, /* Ring (LL/LL128/Simple)*/ { 2.5, 2.5, 5 }, /* CollNet (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 } },
|
|
|
|
|
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 } },
|
|
|
|
|
/* PCI */
|
|
|
|
|
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
|
|
|
|
|
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
|
|
|
|
|
/* NET */
|
|
|
|
|
{ /* Tree (LL/LL128/Simple)*/ { 28.0, 28.0, 66.0 }, /* Ring (LL/LL128/Simple)*/ { 8.5, 8.5, 19.0 }, /* CollNet (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 } }
|
|
|
|
|
{ /* Tree (LL/LL128/Simple)*/ { 40.0, 40.0, 50.0 }, /* Ring (LL/LL128/Simple)*/ { 4.0, 4.0, 25.0 }, /* CollNet (LL/LL128/Simple)*/ { 40.0, 40.0, 50.0 } }
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// LL128 max BW (per channel) for the different collectives
|
|
|
|
@@ -76,7 +76,7 @@ static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15
|
|
|
|
|
static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} };
|
|
|
|
|
static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 22.5, 16.0} };
|
|
|
|
|
|
|
|
|
|
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
|
|
|
|
|
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, int gcn) {
|
|
|
|
|
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
|
|
|
|
|
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
|
|
|
|
|
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
|
|
|
@@ -133,14 +133,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|
|
|
|
|
|
|
|
|
// Various model refinements
|
|
|
|
|
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
|
|
|
|
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/5.0;
|
|
|
|
|
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
|
|
|
|
|
double maxTreeBw = comm->nNodes > 2 ?
|
|
|
|
|
compCap80 && p == NCCL_PROTO_LL128 ? 105.0 : 80.0 :
|
|
|
|
|
compCap80 && p == NCCL_PROTO_LL128 ? 130.0 : 110.0;
|
|
|
|
|
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0);
|
|
|
|
|
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3;
|
|
|
|
|
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0);
|
|
|
|
|
if (a == NCCL_ALGO_RING && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.05;
|
|
|
|
|
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE) (nNodes == 2) ? busBw *= 0.33 : busBw *= 0.11;
|
|
|
|
|
if (a == NCCL_ALGO_TREE && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.04;
|
|
|
|
|
if (gcn == 910 && p == NCCL_PROTO_LL && nNodes == 1 && nRanks == 16) busBw *= 5.9;
|
|
|
|
|
#else
|
|
|
|
|
if (compCap80) busBw = std::min(busBw, 235.0f);
|
|
|
|
|
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
|
|
|
|
@@ -159,7 +155,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|
|
|
|
comm->latencies[coll][a][p] = baseLat[a][p];
|
|
|
|
|
float intraLat = hwLat[intraHw[a]][a][p];
|
|
|
|
|
float interLat = hwLat[NCCL_HW_NET][a][p];
|
|
|
|
|
if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
|
|
|
|
|
//if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
|
|
|
|
|
if (a == NCCL_ALGO_RING) {
|
|
|
|
|
float lat = hwLat[hw[a]][a][p];
|
|
|
|
|
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
|
|
|
|
@@ -285,15 +281,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|
|
|
|
// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
|
|
|
|
|
// factor is not ideal but works quite well. Powers of two, 64 B to 128MB.
|
|
|
|
|
static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
|
|
|
|
|
{ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 },
|
|
|
|
|
{ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 },
|
|
|
|
|
{ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .41, .27, .25, .39, .46, .72, .76, .87, .92, .97, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }
|
|
|
|
|
{ 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, },
|
|
|
|
|
{ 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, },
|
|
|
|
|
{ 0.4, 0.4, 0.3, 0.3, 0.2, 0.5, 0.5, 0.7, 0.2, 0.2, 0.3, 0.6, 0.7, 1.0, 1.3, 1.0, 1.2, 1.2, 1.1, 1.1, 1.2, 1.2, },
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
|
|
|
|
|
{ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .25, .41, .55, .56, .78, .94, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 },
|
|
|
|
|
{ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .25, .41, .55, .56, .78, .94, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 },
|
|
|
|
|
{ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .04, .08, .09, .09, .11, .13, .25, .40, .59, .76, .86, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }
|
|
|
|
|
{ 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, },
|
|
|
|
|
{ 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, },
|
|
|
|
|
{ 0.6, 0.4, 0.4, 0.4, 0.2, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.6, 0.8, 0.9, },
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) {
|
|
|
|
|