Update tuning parameters
Bu işleme şunda yer alıyor:
@@ -451,8 +451,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
|
||||
if (info->coll == ncclFuncAllToAllPivot) {
|
||||
int pivotA2ANumUniRings = comm->topo->pivotA2ANumBiRings * 2;
|
||||
info->nChannels = comm->nChannels / pivotA2ANumUniRings * pivotA2ANumUniRings;
|
||||
} else if (comm->topo->nodes[GPU].nodes[0].gpu.gcn == 910 && comm->nChannels == 32 && comm->nRanks/comm->nNodes == 16 && info->nBytes >= 268435456
|
||||
&& ((comm->nNodes > 2 && info->nBytes <= 2147483648) || (comm->nNodes == 2 && info->nBytes <= 1073741824))) {
|
||||
} else if (comm->topo->nodes[GPU].nodes[0].gpu.gcn == 910 && comm->topo->tuning == 4 &&
|
||||
((comm->nNodes == 2 && info->nBytes == 33554432) || (comm->nNodes <= 4 && info->nBytes == 67108864))) {
|
||||
static int userTuneInput = -2;
|
||||
if (userTuneInput == -2) {
|
||||
const char *protoStr = getenv("NCCL_PROTO");
|
||||
|
||||
@@ -432,7 +432,7 @@ static struct rcclRomeModel rome_model_65 = {
|
||||
.gdrLevel = { PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, },
|
||||
.pattern = "42424242",
|
||||
.ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
|
||||
.options = "netGdrLevel=PHB,tuning=1",
|
||||
.options = "netGdrLevel=PHB,tuning=4",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_66 = {
|
||||
|
||||
@@ -188,11 +188,42 @@ static struct tuningModel tuning_model_3 {
|
||||
},
|
||||
};
|
||||
|
||||
// Tuning parameter set number 4: hardware latencies, bandwidth ratios and
// per-protocol correction factors for the Tree and Ring algorithms.
// NOTE(review): a topology model elsewhere in this change carries the option
// "tuning=4", which presumably selects this entry -- confirm against the
// option parser.
static struct tuningModel tuning_model_4 {
|
||||
// Latency table: one row per link type (NVLINK, PCI, NET); each row holds a
// Tree, a Ring and a CollNet triple ordered LL / LL128 / Simple.
// NOTE(review): units are not visible in this block -- confirm before editing.
.hwLat = {
|
||||
/* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 32.4, 32.4, 24.6 }, /* Ring (LL/LL128/Simple)*/ { 5.5, 5.5, 13.3 }, /* CollNet (LL/LL128/Simple)*/ { 32.4, 32.4, 24.6 } },
|
||||
},
|
||||
|
||||
// Bandwidth ratios: first row applies to 2 nodes, second row to more than
// 2 nodes; the inner layout matches hwLat (Tree / Ring / CollNet, each
// LL / LL128 / Simple).
.bwRatio = {
|
||||
/* 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.62 }, /* Ring (LL/LL128/Simple)*/ { 0.10, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
/* more than 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.31 }, /* Ring (LL/LL128/Simple)*/ { 0.10, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } },
|
||||
},
|
||||
|
||||
// Bandwidth correction factors for the Tree algorithm, three rows.
// NOTE(review): a consumer in this change reads a table of this name as
// treeCorrectionFactor[protocol][logSize], so rows are presumably per
// protocol (LL, LL128, Simple) and columns per logSize bucket -- confirm.
.treeCorrectionFactor = {
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.8, 1.0, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.8, 1.0, 1.0, 1.0, 0.8, 0.6, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.8, 1.0, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.8, 1.0, 1.0, 1.0, 0.8, 0.6, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.2, 1.0, 0.9, 0.1, 0.6, 0.2, 0.6, 0.6, 0.4, 0.8, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 1.0, 0.6, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, },
|
||||
},
|
||||
|
||||
// Correction factors for the Ring algorithm.
// NOTE(review): presumably laid out like treeCorrectionFactor (row per
// protocol, column per logSize bucket) -- confirm against the consumer.
.ringCorrectionFactor = {
|
||||
{ 1.0, 0.1, 0.2, 1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.7, 0.5, 0.6, 0.8, 0.8, 0.8, 0.6, 0.5, 0.6, 0.5, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 1.0, 0.1, 0.2, 1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.7, 0.5, 0.6, 0.8, 0.8, 0.8, 0.6, 0.5, 0.6, 0.5, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 1.0, 0.1, 0.1, 0.1, 0.5, 0.5, 0.4, 0.5, 0.3, 0.2, 0.3, 0.1, 0.1, 0.2, 0.4, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
|
||||
},
|
||||
};
|
||||
|
||||
// Table of every available tuning model. Entries are listed in the order of
// their numeric suffix, so tuning_model_4 sits at index 4.
// NOTE(review): presumably indexed by the topology's `tuning` value -- confirm
// at the lookup site and keep this order in sync when adding a model.
static struct tuningModel rcclTuningModel[] = {
|
||||
tuning_model_0,
|
||||
tuning_model_1,
|
||||
tuning_model_2,
|
||||
tuning_model_3,
|
||||
tuning_model_4,
|
||||
};
|
||||
|
||||
// LL128 max BW per channel
|
||||
@@ -422,9 +453,9 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
|
||||
#else
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
|
||||
#endif
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
|
||||
&& info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
|
||||
#endif
|
||||
// Tree pipelining saves latency in aggregation cases
|
||||
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
|
||||
*time = lat * latCount + (info->nBytes) / (1000 * bw);
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle