Single-node AllGather and ReduceScatter Optimization (#2019)
* Single-node performance tuning
* Normalizing protocol threshold values to the per-rank size (sizePerRank) instead of total nBytes
[ROCm/rccl commit: 63d5846452]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
54194a17c3
Коммит
3e750f0f57
@@ -38,14 +38,20 @@ RCCL_PARAM(DirectAllGatherThreshold, "DIRECT_ALLGATHER_THRESHOLD", 75497472);
|
||||
void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) {
|
||||
// Honor user input for protocol choice
|
||||
static int userProtocolInput = -2;
|
||||
size_t sizePerRank = rcclGetSizePerRank(info->func, nBytes, comm->nRanks);
|
||||
if (userProtocolInput == -2) {
|
||||
const char *protoStr = getenv("NCCL_PROTO");
|
||||
userProtocolInput = !protoStr ? 0 : 1;
|
||||
}
|
||||
if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && comm->nNodes == 1 && (info->func == ncclFuncAllGather) && nBytes <= 524288) {
|
||||
if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && comm->nNodes == 1 && (info->func == ncclFuncAllGather) && sizePerRank <= 88448) {
|
||||
// Change LL protocol threshold
|
||||
info->protocol = NCCL_PROTO_LL;
|
||||
} else if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && comm->nNodes == 1 && (info->func == ncclFuncReduceScatter) && sizePerRank <= 175488) {
|
||||
// Change LL protocol threshold
|
||||
info->protocol = NCCL_PROTO_LL;
|
||||
} else if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") && comm->nNodes == 1 && (info->func == ncclFuncReduceScatter) && sizePerRank <= 352128) {
|
||||
// Change LL protocol threshold
|
||||
info->protocol = NCCL_PROTO_LL;
|
||||
|
||||
} else if(!userProtocolInput && comm->nNodes >= 2 && (info->func == ncclFuncReduceScatter || info->func == ncclFuncAllGather || info->func == ncclFuncAllReduce || info->func == ncclFuncBroadcast || info->func == ncclFuncReduce)) {
|
||||
auto tunableIndex = rcclGetTunableIndex(info->func);
|
||||
auto llMin = comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX];
|
||||
|
||||
Ссылка в новой задаче
Block a user