all_reduce LL/LL128 and Ring/Tree multi-node tuning for MI300 (#1627)
* Enabling LL128 by default on MI300 * Add missing CUDACHECK * Adjust BW correction factors to fix the Tree->Ring switching point * Refactor and add ll128 AR logarithmic factor to tuning models * Move RCCL tuning changes to a separate file * Use enum for tunable indexing * Use explicit indexing in tuning models to avoid mismatch issues * Place rcclGetSizePerRank in a function * Remove HIP ifdef for rccl-only call --------- Co-authored-by: Mustafa Abduljabbar <mustafa.abduljabbar@amd.com>
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
5b36b68d06
Коммит
e40ff4f84a
@@ -422,6 +422,7 @@ set(SRC_FILES
|
||||
src/net.cc
|
||||
src/msccl.cc
|
||||
src/proxy.cc
|
||||
src/rccl_wrap.cc
|
||||
src/register.cc
|
||||
src/transport.cc
|
||||
src/device/all_gather.h
|
||||
@@ -498,6 +499,7 @@ set(SRC_FILES
|
||||
src/include/param.h
|
||||
src/include/profiler.h
|
||||
src/include/proxy.h
|
||||
src/include/rccl_common.h
|
||||
src/include/rccl_vars.h
|
||||
src/include/register.h
|
||||
src/include/rccl_float8.h
|
||||
|
||||
+1
-46
@@ -1938,55 +1938,10 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
info->protocol = backupProto;
|
||||
time = backupTime;
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
// Honor user input for protocol choice
|
||||
static int userProtocolInput = -2;
|
||||
if (userProtocolInput == -2) {
|
||||
const char *protoStr = getenv("NCCL_PROTO");
|
||||
userProtocolInput = !protoStr ? 0 : 1;
|
||||
}
|
||||
|
||||
if(!userProtocolInput && comm->nNodes >= 2 && (info->func == ncclFuncReduceScatter || info->func == ncclFuncAllGather)) {
|
||||
auto llMin = comm->minMaxLLRange[info->func][NCCL_PROTO_LL][0];
|
||||
auto llMax = comm->minMaxLLRange[info->func][NCCL_PROTO_LL][1];
|
||||
|
||||
auto ll128Min = comm->minMaxLLRange[info->func][NCCL_PROTO_LL128][0];
|
||||
auto ll128Max = comm->minMaxLLRange[info->func][NCCL_PROTO_LL128][1];
|
||||
|
||||
// Only override model choices if min/max cutoff points are set in the tuning models
|
||||
if((ll128Max != RCCL_LL_LIMITS_UNDEFINED) || (llMax != RCCL_LL_LIMITS_UNDEFINED)) {
|
||||
// Keep it simple unless otherwise required
|
||||
info->protocol = NCCL_PROTO_SIMPLE;
|
||||
// Normalize the comparison to sizePerRank as this is essentially what matters in determining protocol choice
|
||||
size_t sizePerRank = nBytes / comm->nRanks;
|
||||
|
||||
if(sizePerRank <= llMax && sizePerRank > llMin) {
|
||||
info->protocol = NCCL_PROTO_LL;
|
||||
}
|
||||
#if defined(ENABLE_LL128)
|
||||
// When applicable, LL128 RS performance is better than LL, so the next condition overrides the previous LL choice
|
||||
if(comm->topo->ll128Enabled) {
|
||||
if(sizePerRank <= ll128Max && sizePerRank > ll128Min) {
|
||||
info->protocol = NCCL_PROTO_LL128;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} else if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942")) {
|
||||
// Warn that model detection for MI300 (or future others) did not work as expected
|
||||
// Add supported archs to this condition as they come (e.g. gfx950)
|
||||
// Also make sure the tuning_model and model detection are updated for new archs
|
||||
static bool failedWarn = false;
|
||||
if (!failedWarn) {
|
||||
WARN("LL cutoff points not detected for a supported arch %s", comm->topo->nodes[GPU].nodes[0].gpu.gcn);
|
||||
failedWarn = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
rcclUpdateCollectiveProtocol(comm, nBytes, info);
|
||||
if (comm->rank == 0) INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %d proto %d time %f", ncclFuncToString(info->func), nBytes, info->algorithm, info->protocol, time);
|
||||
if (simInfo) simInfo->estimatedTime = time;
|
||||
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
|
||||
|
||||
int nc = comm->nChannels;
|
||||
int nt = comm->maxThreads[info->algorithm][info->protocol];
|
||||
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
|
||||
|
||||
@@ -814,7 +814,7 @@ static struct rcclRomeModel rome_model_81 = {
|
||||
"N7 7 3 2 6 0 4 1 5 N5|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|",
|
||||
|
||||
.options = "noCpuCheck=1,tuning=5,disableNumaMatching=1",
|
||||
.options = "noCpuCheck=1,tuning=5,ll128Enabled=1,disableNumaMatching=1",
|
||||
|
||||
.treeRail = "N0 0 1 2 4 3 6 5 7 N1|"
|
||||
"N1 1 0 4 7 3 5 2 6 N0|"
|
||||
|
||||
@@ -123,12 +123,6 @@ struct ncclTopoLinkList {
|
||||
#define RCCL_TOPO_FORCE_INTRA 16
|
||||
#define RCCL_TOPO_XGMI_ALL 32
|
||||
|
||||
#define RCCL_LL_TUNABLE_COLLS 4 // LL/LL64/LL128 tunable Collectives
|
||||
#define RCCL_RS_TUNABLE 0 // reduce_scatter index
|
||||
#define RCCL_AG_TUNABLE 1 // all_gather index
|
||||
#define RCCL_AR_TUNABLE 2 // all_reduce index
|
||||
#define RCCL_RE_TUNABLE 3 // reduce index
|
||||
#define RCCL_LL_LIMITS_UNDEFINED 0
|
||||
|
||||
#define GCN_ARCH_NAME_LEN 16
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ struct tuningModel {
|
||||
float bwRatio [2][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
|
||||
float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
|
||||
uint64_t llProtoRanges[RCCL_LL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][2];
|
||||
uint64_t llProtoRanges[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE];
|
||||
};
|
||||
|
||||
static struct tuningModel tuning_model_0 {
|
||||
@@ -254,19 +254,18 @@ static struct tuningModel tuning_model_5 {
|
||||
.treeCorrectionFactor = {
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 1.0, 0.9, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 1.0, 0.9, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.7, 0.5, 0.6, 0.6, 0.6, },
|
||||
},
|
||||
|
||||
.ringCorrectionFactor = {
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 1.0, 0.4, 0.4, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 1.0, 0.4, 0.4, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.8, 1.0, 1.0, 1.0, },
|
||||
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.8, 1.0, 1.0, 1.0, },
|
||||
},
|
||||
|
||||
.llProtoRanges = {
|
||||
/*ReduceScatter*/ {/* LL (Min/Max) */ {0, 655360} , /* LL128 (Min/Max) */ {131072, 3211264}},
|
||||
/*AllGather*/ {/* LL (Min/Max) */ {0, 98304} , /* LL128 (Min/Max) */ {98304, 5046272}},
|
||||
},
|
||||
.llProtoRanges[RCCL_RS_TUNABLE] = /*ReduceScatter*/ {/* LL (Min/Max) */ {0, 655360, 1} , /* LL128 (Min/Max) */ {131072, 3211264, 1}},
|
||||
.llProtoRanges[RCCL_AG_TUNABLE] = /*AllGather*/ {/* LL (Min/Max) */ {0, 98304, 1} , /* LL128 (Min/Max) */ {98304, 5046272, 1}},
|
||||
.llProtoRanges[RCCL_AR_TUNABLE] = /*AllReduce*/ {/* LL (Min/Max) */ {0, 1048576, 1} , /* LL128 (Min/Max) */ {1048576, 9437184, 3145728}},
|
||||
};
|
||||
|
||||
static struct tuningModel rcclTuningModel[] = {
|
||||
@@ -372,13 +371,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
|
||||
|
||||
memcpy(comm->minMaxLLRange[ncclFuncReduceScatter],
|
||||
rcclTuningModel[comm->topo->tuning].llProtoRanges[RCCL_RS_TUNABLE],
|
||||
sizeof(rcclTuningModel[comm->topo->tuning].llProtoRanges[RCCL_RS_TUNABLE]));
|
||||
|
||||
memcpy(comm->minMaxLLRange[ncclFuncAllGather],
|
||||
rcclTuningModel[comm->topo->tuning].llProtoRanges[RCCL_AG_TUNABLE],
|
||||
sizeof(rcclTuningModel[comm->topo->tuning].llProtoRanges[RCCL_AG_TUNABLE]));
|
||||
memcpy(comm->minMaxLLRange,
|
||||
rcclTuningModel[comm->topo->tuning].llProtoRanges,
|
||||
sizeof(rcclTuningModel[comm->topo->tuning].llProtoRanges));
|
||||
|
||||
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
|
||||
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "graph.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "profiler.h"
|
||||
#include "rccl_common.h"
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
#define HIPRT_CB
|
||||
@@ -517,7 +518,7 @@ struct ncclComm {
|
||||
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS];
|
||||
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
uint64_t minMaxLLRange[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS - 1][2];
|
||||
uint64_t minMaxLLRange[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE];
|
||||
|
||||
/* This attribute can indicate the states of communicators and return code of
|
||||
* asynchronous NCCL operations. */
|
||||
@@ -710,7 +711,7 @@ inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
|
||||
}
|
||||
}
|
||||
finish:
|
||||
cudaThreadExchangeStreamCaptureMode(&mode);
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef RCCL_COMMON_H_
|
||||
#define RCCL_COMMON_H_
|
||||
#include "nccl_common.h"
|
||||
|
||||
typedef enum RcclTunableColls {
|
||||
RCCL_UNSUPPORTED_TUNABLE = -1,
|
||||
RCCL_RS_TUNABLE = 0, // reduce_scatter index
|
||||
RCCL_AG_TUNABLE = 1, // all_gather index
|
||||
RCCL_AR_TUNABLE = 2, // all_reduce index
|
||||
RCCL_RE_TUNABLE = 3, // reduce index
|
||||
RCCL_TUNABLE_COLLS = 4 // LL/LL64/LL128 tunable collectives count
|
||||
} rcclTunableIndex_t;
|
||||
|
||||
#define RCCL_LL_LIMITS_UNDEFINED 0
|
||||
#define RCCL_PROTOCOL_ENTRY_SIZE 3
|
||||
#define RCCL_PROTOCOL_MIN_IDX 0
|
||||
#define RCCL_PROTOCOL_MAX_IDX 1
|
||||
#define RCCL_PROTOCOL_FACTOR_IDX 2
|
||||
|
||||
inline rcclTunableIndex_t rcclGetTunableIndex(ncclFunc_t const& func) {
|
||||
switch (func) {
|
||||
case ncclFuncReduceScatter:
|
||||
return RCCL_RS_TUNABLE;
|
||||
case ncclFuncAllGather:
|
||||
return RCCL_AG_TUNABLE;
|
||||
case ncclFuncAllReduce:
|
||||
return RCCL_AR_TUNABLE;
|
||||
case ncclFuncReduce:
|
||||
return RCCL_RE_TUNABLE;
|
||||
default:
|
||||
return RCCL_UNSUPPORTED_TUNABLE; // Invalid or unsupported function
|
||||
}
|
||||
}
|
||||
|
||||
inline size_t rcclGetSizePerRank(ncclFunc_t const& func, size_t const& nBytes, int const& nRanks) {
|
||||
// Normalize the comparison to sizePerRank as this is essentially what matters in determining protocol choice for the impacted collectives
|
||||
// For AG, this is the send size per rank
|
||||
// For RS, this is the recv size per rank
|
||||
// For AR, this is the send/recv size per rank
|
||||
return (func == ncclFuncReduceScatter || func == ncclFuncAllGather) ? nBytes / nRanks : nBytes;
|
||||
}
|
||||
void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info);
|
||||
#endif
|
||||
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "rccl_common.h"
|
||||
#include "comm.h"
|
||||
#include "graph/topo.h"
|
||||
|
||||
void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) {
|
||||
// Honor user input for protocol choice
|
||||
static int userProtocolInput = -2;
|
||||
if (userProtocolInput == -2) {
|
||||
const char *protoStr = getenv("NCCL_PROTO");
|
||||
userProtocolInput = !protoStr ? 0 : 1;
|
||||
}
|
||||
|
||||
if(!userProtocolInput && comm->nNodes >= 2 && (info->func == ncclFuncReduceScatter || info->func == ncclFuncAllGather || info->func == ncclFuncAllReduce)) {
|
||||
auto tunableIndex = rcclGetTunableIndex(info->func);
|
||||
auto llMin = comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX];
|
||||
auto llMax = comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX];
|
||||
|
||||
auto ll128Min = comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL128][RCCL_PROTOCOL_MIN_IDX];
|
||||
auto ll128Max = comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL128][RCCL_PROTOCOL_MAX_IDX];
|
||||
|
||||
// Only override model choices if min/max cutoff points are set in the tuning models
|
||||
if ((ll128Max != RCCL_LL_LIMITS_UNDEFINED) || (llMax != RCCL_LL_LIMITS_UNDEFINED)) {
|
||||
// Keep it simple unless otherwise required
|
||||
info->protocol = NCCL_PROTO_SIMPLE;
|
||||
size_t sizePerRank = rcclGetSizePerRank(info->func, nBytes, comm->nRanks);
|
||||
if (sizePerRank <= llMax && sizePerRank > llMin) {
|
||||
info->protocol = NCCL_PROTO_LL;
|
||||
}
|
||||
#if defined(ENABLE_LL128)
|
||||
// When LL128 is performant, the next condition overrides the previous LL choice
|
||||
if (comm->topo->ll128Enabled) {
|
||||
if (info->func == ncclFuncAllReduce) {
|
||||
ll128Max += (log2i(comm->nNodes) - 1) * comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX];
|
||||
}
|
||||
if (sizePerRank <= ll128Max && sizePerRank > ll128Min) {
|
||||
info->protocol = NCCL_PROTO_LL128;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} else if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942")) {
|
||||
// Warn that model detection for MI300 (or future others) did not work as expected
|
||||
// Add supported archs to this condition as they come (e.g. gfx950)
|
||||
// Also make sure the tuning_model and model detection are updated for new archs
|
||||
static bool failedWarn = false;
|
||||
if (!failedWarn) {
|
||||
WARN("LL cutoff points not detected for a supported arch %s", comm->topo->nodes[GPU].nodes[0].gpu.gcn);
|
||||
failedWarn = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ссылка в новой задаче
Block a user