diff --git a/src/graph/rome_models.cc b/src/graph/rome_models.cc index 275509a015..a2fa9f9653 100644 --- a/src/graph/rome_models.cc +++ b/src/graph/rome_models.cc @@ -1953,7 +1953,9 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra INFO(NCCL_GRAPH, "%s", line); system->type |= RCCL_TOPO_4P2H_ROME; parseOptions(system, romeTopoModels[i].options); - + if(IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")){ + system->tuning = 6; + } // create 4P2H based on reference and remapped ids switch (graph->pattern) { case NCCL_TOPO_PATTERN_RING: @@ -2152,7 +2154,9 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* } INFO(NCCL_GRAPH, "%s", line); parseOptions(system, romeTopoModels[i].options); - + if(IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")){ + system->tuning = 6; + } // create 4P2H based on reference and remapped ids switch (graph->pattern) { case NCCL_TOPO_PATTERN_RING: @@ -2339,7 +2343,9 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra INFO(NCCL_GRAPH, "%s", line); system->type |= RCCL_TOPO_16P1H; parseOptions(system, romeTopoModels[i].options); - + if(IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")){ + system->tuning = 6; + } // create 16P1H based on reference and remapped ids NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, rdm, nnets > 1 ? n : NULL, false)); diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 468a376442..bf151a1374 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -1,3 +1,4 @@ + /************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
@@ -337,6 +338,45 @@ static struct tuningModel tuning_model_5 { }, }; +static struct tuningModel tuning_model_6 { /* Tuning model 6. The rome_models.cc hunks of this patch set system->tuning = 6 when the first GPU's gcn arch matches "gfx950"; presumably system->tuning selects an entry of rcclTuningModel[] - confirm. */ + .hwLat = { /* One row per link type (NVLINK, PCI, NET); each row holds one { LL, LL128, Simple } triple per algorithm, in the order given by the inline labels. NVLS and NVLS Tree triples are all zero in this model. */ + /* NVLINK */ + { /* Tree (LL/LL128/Simple)*/ { 0.9, 0.9, 2.3 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.8, 2.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.9 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + /* PCI */ + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + /* NET */ /* NOTE(review): the Ring/Simple value below (320.0) is an order of magnitude above the neighbouring entries - confirm it is intended. */ + { /* Tree (LL/LL128/Simple)*/ { 10.5, 10.5, 25.0 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 9.5, 320.0 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 10.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + }, + + .bwRatio = { /* Two rows: 2 nodes, and more than 2 nodes; same per-algorithm { LL, LL128, Simple } layout as hwLat. */ + /* 2 nodes */ + { /* Tree (LL/LL128/Simple)*/ { 0.06, 0.06, 0.11 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 0.08, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + /* more than 2 nodes */ + { /* Tree (LL/LL128/Simple)*/ { 0.06, 0.06, 0.59 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 0.08, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + }, + + .treeCorrectionFactor = { /* 3 rows of 27 factors; rows are presumably LL, LL128, Simple and columns presumably message-size buckets - TODO confirm against the struct tuningModel declaration. Rows 0 and 1 are identical. */ + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 1.0, 0.9, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 1.0, 0.9, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.7, 0.5, 0.6, 0.6, 0.6, }, + }, + + .ringCorrectionFactor = { /* Same 3 x 27 layout as treeCorrectionFactor; rows 0 and 1 are identical here as well. */ + { 0.1, 0.1, 
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 1.0, 0.4, 0.4, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 1.0, 0.4, 0.4, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.8, 1.0, 1.0, 1.0, }, + }, + // Follow order in RcclTunableColls + .llProtoRanges = { + /*ReduceScatter*/ + {/*LL (min/max/factor/thread_threshold)*/ {0, 65536, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {65536, 4194304, 1, 64}}, + /*AllGather*/ + {/*LL (min/max/factor/thread_threshold)*/ {0, 65536, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {65536, 8388608, 1, 64}}, + /*AllReduce*/ + {/*LL (min/max/factor/thread_threshold)*/ {0, 262144, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {262144, 30408704, 3145728, 0}}, + }, +}; + static struct tuningModel rcclTuningModel[] = { tuning_model_0, tuning_model_1, @@ -344,6 +384,8 @@ static struct tuningModel rcclTuningModel[] = { tuning_model_3, tuning_model_4, tuning_model_5, + tuning_model_6, /* appended after tuning_model_5; presumably slot 6, the value assigned to system->tuning for gfx950 - confirm */ + }; /* Array indexes used below */