diff --git a/ext-tuner/basic/README.md b/ext-tuner/basic/README.md index acc6d55452..08e171045f 100644 --- a/ext-tuner/basic/README.md +++ b/ext-tuner/basic/README.md @@ -36,7 +36,7 @@ ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes - **Purpose**: Modify cost tables for collective operations - **Current Implementation**: - Sets RING+SIMPLE algorithm to cost 0.0 (highest preference) - - Sets channel count to 1 + - Sets channel count to 0 - **Parameters**: - `context`: Plugin context from init - `collType`: Type of collective operation diff --git a/ext-tuner/basic/plugin.c b/ext-tuner/basic/plugin.c index a17fd009e3..08244be1d4 100644 --- a/ext-tuner/basic/plugin.c +++ b/ext-tuner/basic/plugin.c @@ -18,7 +18,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; } - *nChannels = 1; + *nChannels = 0; return ncclSuccess; } diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index 84fb0c941e..c4af3c6b20 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -330,8 +330,8 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size TunerContext* ctx = (TunerContext*)context; if (!ctx) return ncclInternalError; - // Default channels - *nChannels = 1; + // Set default channels to 0 to ensure RCCL uses its default channel selection logic in case no match is found or wildcard is used in config. + *nChannels = 0; if (ctx->logFunction) { ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, diff --git a/src/enqueue.cc b/src/enqueue.cc index 6c26d7d65e..b2f3ba4f7b 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -2055,6 +2055,9 @@ static ncclResult_t topoGetAlgoInfo( info->protocol = protocol; float time = minTime; + // Tuner plugin sets cost to 0.0 if it finds a match + bool isTunerMatchFound = (comm->tuner != NULL && minTime == 0.0); + // Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case. // coverity[check_after_sink] if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { @@ -2071,7 +2074,10 @@ static ncclResult_t topoGetAlgoInfo( WARN("Error : no algorithm/protocol available for function %s with datatype %s.%s%s", ncclFuncToString(info->func), ncclDatatypeToString(info->datatype), ncclAlgoEnvStr, ncclProtoEnvStr); return (algoEnv || protoEnv) ? ncclInvalidUsage : ncclInternalError; } - rcclUpdateCollectiveProtocol(comm, nBytes, info); + // Honor Tuner config if available + if (!isTunerMatchFound) { + rcclUpdateCollectiveProtocol(comm, nBytes, info); + } rcclSetPipelining(comm, nBytes, info); if (simInfo) simInfo->estimatedTime = time; TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);