diff --git a/src/enqueue.cc b/src/enqueue.cc index 3b4ccb6ecf..7e8ffb8b64 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -451,8 +451,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i if (info->coll == ncclFuncAllToAllPivot) { int pivotA2ANumUniRings = comm->topo->pivotA2ANumBiRings * 2; info->nChannels = comm->nChannels / pivotA2ANumUniRings * pivotA2ANumUniRings; - } else if (comm->topo->nodes[GPU].nodes[0].gpu.gcn == 910 && comm->nChannels == 32 && comm->nRanks/comm->nNodes == 16 && info->nBytes >= 268435456 - && ((comm->nNodes > 2 && info->nBytes <= 2147483648) || (comm->nNodes == 2 && info->nBytes <= 1073741824))) { + } else if (comm->topo->nodes[GPU].nodes[0].gpu.gcn == 910 && comm->topo->tuning == 4 && + ((comm->nNodes == 2 && info->nBytes == 33554432) || (comm->nNodes <= 4 && info->nBytes == 67108864))) { static int userTuneInput = -2; if (userTuneInput == -2) { const char *protoStr = getenv("NCCL_PROTO"); diff --git a/src/graph/rome_models.cc b/src/graph/rome_models.cc index 4b5325060a..135c8f4ee2 100644 --- a/src/graph/rome_models.cc +++ b/src/graph/rome_models.cc @@ -432,7 +432,7 @@ static struct rcclRomeModel rome_model_65 = { .gdrLevel = { PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, 
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, }, .pattern = "42424242", .ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6", - .options = "netGdrLevel=PHB,tuning=1", + 
.options = "netGdrLevel=PHB,tuning=4", }; static struct rcclRomeModel rome_model_66 = { diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index ff04c58444..137386197a 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -188,11 +188,42 @@ static struct tuningModel tuning_model_3 { }, }; +static struct tuningModel tuning_model_4 { /* Tuning table stored at index 4 of rcclTuningModel[] (it follows tuning_model_0..3 there). NOTE(review): presumably this is the table picked by a tuning=4 model option and by the comm->topo->tuning == 4 check in enqueue.cc; confirm against the option parser. */ + .hwLat = { + /* NVLINK */ + { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 } }, + /* PCI */ + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } }, + /* NET */ + { /* Tree (LL/LL128/Simple)*/ { 32.4, 32.4, 24.6 }, /* Ring (LL/LL128/Simple)*/ { 5.5, 5.5, 13.3 }, /* CollNet (LL/LL128/Simple)*/ { 32.4, 32.4, 24.6 } }, + }, + + .bwRatio = { + /* 2 nodes */ + { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.62 }, /* Ring (LL/LL128/Simple)*/ { 0.10, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } }, + /* more than 2 nodes */ + { /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.31 }, /* Ring (LL/LL128/Simple)*/ { 0.10, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } }, + }, + + .treeCorrectionFactor = { + { 0.1, 0.1, 0.1, 0.1, 0.8, 1.0, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.8, 1.0, 1.0, 1.0, 0.8, 0.6, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.1, 0.1, 0.1, 0.1, 0.8, 1.0, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.8, 1.0, 1.0, 1.0, 0.8, 0.6, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.2, 1.0, 0.9, 0.1, 0.6, 0.2, 0.6, 0.6, 0.4, 0.8, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 1.0, 0.6, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, }, + }, + + .ringCorrectionFactor = { + { 1.0, 0.1, 0.2, 1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.7, 0.5, 0.6, 0.8, 0.8, 0.8, 0.6, 0.5, 0.6, 0.5, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 1.0, 0.1, 0.2, 1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.7, 0.5, 0.6, 0.8, 0.8, 0.8, 0.6, 0.5, 0.6, 0.5, 0.3, 0.2, 0.1, 0.1, 
0.1, 0.1, 0.1, }, + { 0.1, 0.1, 1.0, 0.1, 0.1, 0.1, 0.5, 0.5, 0.4, 0.5, 0.3, 0.2, 0.3, 0.1, 0.1, 0.2, 0.4, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, + }, +}; + static struct tuningModel rcclTuningModel[] = { tuning_model_0, tuning_model_1, tuning_model_2, tuning_model_3, + tuning_model_4, }; // LL128 max BW per channel @@ -422,9 +453,9 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto #else if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels; +#endif /* Ends the conditional block here, so the ring latency scaling (lat *= 1.9) that follows is outside the #else branch rather than inside it. */ if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1 && info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring -#endif // Tree pipelining saves latency in aggregation cases int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS); *time = lat * latCount + (info->nBytes) / (1000 * bw);