Fix protocol and channel override when tuner is used (#1985)

* Fix protocol and channel override when tuner is used

* Added comment

* Fix README for basic tuner implementation
Этот коммит содержится в:
nawrinsu
2025-11-03 13:56:34 -08:00
коммит произвёл GitHub
родитель caffd013f6
Коммит 166268d715
4 изменённых файла: 11 добавлений и 5 удалений
+1 -1
Просмотреть файл
@@ -36,7 +36,7 @@ ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes
- **Purpose**: Modify cost tables for collective operations
- **Current Implementation**:
- Sets RING+SIMPLE algorithm to cost 0.0 (highest preference)
-- Sets channel count to 1
+- Sets channel count to 0
- **Parameters**:
- `context`: Plugin context from init
- `collType`: Type of collective operation
+1 -1
Просмотреть файл
@@ -18,7 +18,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size
if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
}
*nChannels = 1;
*nChannels = 0;
return ncclSuccess;
}
+2 -2
Просмотреть файл
@@ -330,8 +330,8 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size
TunerContext* ctx = (TunerContext*)context;
if (!ctx) return ncclInternalError;
// Default channels
*nChannels = 1;
// Set default channels to 0 to ensure RCCL uses its default channel selection logic in case no match is found or wildcard is used in config.
*nChannels = 0;
if (ctx->logFunction) {
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
+7 -1
Просмотреть файл
@@ -2055,6 +2055,9 @@ static ncclResult_t topoGetAlgoInfo(
info->protocol = protocol;
float time = minTime;
// Tuner plugin sets cost to 0.0 if it finds a match
bool isTunerMatchFound = (comm->tuner != NULL && minTime == 0.0);
// Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case.
// coverity[check_after_sink]
if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
@@ -2071,7 +2074,10 @@ static ncclResult_t topoGetAlgoInfo(
WARN("Error : no algorithm/protocol available for function %s with datatype %s.%s%s", ncclFuncToString(info->func), ncclDatatypeToString(info->datatype), ncclAlgoEnvStr, ncclProtoEnvStr);
return (algoEnv || protoEnv) ? ncclInvalidUsage : ncclInternalError;
}
rcclUpdateCollectiveProtocol(comm, nBytes, info);
// Honor Tuner config if available
if (!isTunerMatchFound) {
rcclUpdateCollectiveProtocol(comm, nBytes, info);
}
rcclSetPipelining(comm, nBytes, info);
if (simInfo) simInfo->estimatedTime = time;
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);