added tuning table for gfx950 (#1779)
* added tuning table for mi350
* remove erroneous string
Этот коммит содержится в:
@@ -1953,7 +1953,9 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
system->type |= RCCL_TOPO_4P2H_ROME;
|
||||
parseOptions(system, romeTopoModels[i].options);
|
||||
|
||||
if(IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")){
|
||||
system->tuning = 6;
|
||||
}
|
||||
// create 4P2H based on reference and remapped ids
|
||||
switch (graph->pattern) {
|
||||
case NCCL_TOPO_PATTERN_RING:
|
||||
@@ -2152,7 +2154,9 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
|
||||
}
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
parseOptions(system, romeTopoModels[i].options);
|
||||
|
||||
if(IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")){
|
||||
system->tuning = 6;
|
||||
}
|
||||
// create 4P2H based on reference and remapped ids
|
||||
switch (graph->pattern) {
|
||||
case NCCL_TOPO_PATTERN_RING:
|
||||
@@ -2339,7 +2343,9 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
system->type |= RCCL_TOPO_16P1H;
|
||||
parseOptions(system, romeTopoModels[i].options);
|
||||
|
||||
if(IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")){
|
||||
system->tuning = 6;
|
||||
}
|
||||
|
||||
// create 16P1H based on reference and remapped ids
|
||||
NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, rdm, nnets > 1 ? n : NULL, false));
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
@@ -337,6 +338,45 @@ static struct tuningModel tuning_model_5 {
|
||||
},
|
||||
};
|
||||
|
||||
// Tuning table number 6, added for gfx950: the topology parsers in this change
// set system->tuning = 6 when the first GPU's arch string matches "gfx950".
// NOTE(review): presumably system->tuning is used as the index into
// rcclTuningModel[] -- confirm against the consumer of that array.
//
// Layout, as labelled by the inline comments below:
//   .hwLat    - one row per link type (NVLINK, PCI, NET); each row holds one
//               triple per algorithm (Tree, Ring, CollNetDirect, CollNetChain,
//               NVLS, NVLS Tree), ordered LL / LL128 / Simple.
//   .bwRatio  - same per-algorithm triples, one row for 2 nodes and one row
//               for more than 2 nodes.
//   .treeCorrectionFactor / .ringCorrectionFactor
//             - three rows of scaling factors each.
//               NOTE(review): rows are presumably per protocol and columns
//               per message-size bucket -- verify against the lookup code.
//   .llProtoRanges
//             - one row per collective (ReduceScatter, AllGather, AllReduce);
//               each row gives {min, max, factor, thread_threshold} for LL and
//               for LL64/128.
static struct tuningModel tuning_model_6 {
|
||||
.hwLat = {
|
||||
/* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.9, 0.9, 2.3 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.8, 2.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.9 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 10.5, 10.5, 25.0 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 9.5, 320.0 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 10.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
},
|
||||
|
||||
.bwRatio = {
|
||||
/* 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.06, 0.06, 0.11 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 0.08, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
/* more than 2 nodes */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 0.06, 0.06, 0.59 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 0.08, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } },
|
||||
},
|
||||
|
||||
.treeCorrectionFactor = {
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 1.0, 0.9, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 1.0, 0.9, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.7, 0.5, 0.6, 0.6, 0.6, },
|
||||
},
|
||||
|
||||
.ringCorrectionFactor = {
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 1.0, 0.4, 0.4, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 1.0, 0.4, 0.4, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.8, 1.0, 1.0, 1.0, },
|
||||
},
|
||||
// Follow order in RcclTunableColls
|
||||
.llProtoRanges = {
|
||||
/*ReduceScatter*/
|
||||
{/*LL (min/max/factor/thread_threshold)*/ {0, 65536, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {65536, 4194304, 1, 64}},
|
||||
/*AllGather*/
|
||||
{/*LL (min/max/factor/thread_threshold)*/ {0, 65536, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {65536, 8388608, 1, 64}},
|
||||
/*AllReduce*/
|
||||
{/*LL (min/max/factor/thread_threshold)*/ {0, 262144, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {262144, 30408704, 3145728, 0}},
|
||||
},
|
||||
};
|
||||
|
||||
static struct tuningModel rcclTuningModel[] = {
|
||||
tuning_model_0,
|
||||
tuning_model_1,
|
||||
@@ -344,6 +384,8 @@ static struct tuningModel rcclTuningModel[] = {
|
||||
tuning_model_3,
|
||||
tuning_model_4,
|
||||
tuning_model_5,
|
||||
tuning_model_6,
|
||||
|
||||
};
|
||||
|
||||
/* Array indexes used below */
|
||||
|
||||
Ссылка в новой задаче
Block a user