proxy: handle progressOps return code properly. (#2029)

[ROCm/rccl commit: d6a53d2022]
Этот коммит содержится в:
Arm Patinyasakdikul
2025-11-04 07:09:50 -08:00
коммит произвёл GitHub
родитель 4babb01f4d
Коммит 25005c1cce
+4 -1
Просмотреть файл
@@ -779,6 +779,7 @@ static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclPr
static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) {
struct ncclProxyArgs* prevOp = NULL;
struct ncclProxyArgs* op = opStart;
ncclResult_t status = ncclSuccess;
while (op) {
op->retry_total++;
if (op->state == ncclProxyOpNone) return ncclInternalError;
@@ -787,6 +788,8 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr
if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); }
*idle &= op->idle;
if (op->state == ncclProxyOpNone || ret != ncclSuccess) {
// Track the first error that occurred.
if (ret != ncclSuccess && status == ncclSuccess) status = ret;
TIME_START(2);
NCCLCHECK(removeOp(state, &op, &prevOp));
TIME_STOP(2);
@@ -795,7 +798,7 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr
op = op->next;
}
}
return ncclSuccess;
return status;
}
NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);