Fix protocol and channel override when tuner is used (#1985)
* Fix protocol and channel override when tuner is used
* Added comment
* Fix README for basic tuner implementation
Этот коммит содержится в:
@@ -36,7 +36,7 @@ ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes
|
||||
- **Purpose**: Modify cost tables for collective operations
|
||||
- **Current Implementation**:
|
||||
- Sets RING+SIMPLE algorithm to cost 0.0 (highest preference)
|
||||
- Sets channel count to 1
|
||||
- Sets channel count to 0 (the library then uses its default channel selection logic)
|
||||
- **Parameters**:
|
||||
- `context`: Plugin context from init
|
||||
- `collType`: Type of collective operation
|
||||
|
||||
@@ -18,7 +18,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size
|
||||
if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
|
||||
table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
|
||||
}
|
||||
*nChannels = 1;
|
||||
*nChannels = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -330,8 +330,8 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size
|
||||
TunerContext* ctx = (TunerContext*)context;
|
||||
if (!ctx) return ncclInternalError;
|
||||
|
||||
// Default channels
|
||||
*nChannels = 1;
|
||||
// Set the default channel count to 0 so that RCCL uses its default channel selection logic when no match is found or a wildcard is used in the config.
|
||||
*nChannels = 0;
|
||||
|
||||
if (ctx->logFunction) {
|
||||
ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__,
|
||||
|
||||
@@ -2055,6 +2055,9 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
info->protocol = protocol;
|
||||
float time = minTime;
|
||||
|
||||
// Tuner plugin sets cost to 0.0 if it finds a match
|
||||
bool isTunerMatchFound = (comm->tuner != NULL && minTime == 0.0);
|
||||
|
||||
// Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case.
|
||||
// coverity[check_after_sink]
|
||||
if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
|
||||
@@ -2071,7 +2074,10 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
WARN("Error : no algorithm/protocol available for function %s with datatype %s.%s%s", ncclFuncToString(info->func), ncclDatatypeToString(info->datatype), ncclAlgoEnvStr, ncclProtoEnvStr);
|
||||
return (algoEnv || protoEnv) ? ncclInvalidUsage : ncclInternalError;
|
||||
}
|
||||
rcclUpdateCollectiveProtocol(comm, nBytes, info);
|
||||
// Honor Tuner config if available
|
||||
if (!isTunerMatchFound) {
|
||||
rcclUpdateCollectiveProtocol(comm, nBytes, info);
|
||||
}
|
||||
rcclSetPipelining(comm, nBytes, info);
|
||||
if (simInfo) simInfo->estimatedTime = time;
|
||||
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
|
||||
|
||||
Ссылка в новой задаче
Block a user