proxy: handle progressOps return code properly. (#2029)
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
07f8f6d6c6
Коммит
d6a53d2022
+4
-1
@@ -779,6 +779,7 @@ static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclPr
|
||||
static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) {
|
||||
struct ncclProxyArgs* prevOp = NULL;
|
||||
struct ncclProxyArgs* op = opStart;
|
||||
ncclResult_t status = ncclSuccess;
|
||||
while (op) {
|
||||
op->retry_total++;
|
||||
if (op->state == ncclProxyOpNone) return ncclInternalError;
|
||||
@@ -787,6 +788,8 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr
|
||||
if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); }
|
||||
*idle &= op->idle;
|
||||
if (op->state == ncclProxyOpNone || ret != ncclSuccess) {
|
||||
// track the first error that occurred
|
||||
if (ret != ncclSuccess && status == ncclSuccess) status = ret;
|
||||
TIME_START(2);
|
||||
NCCLCHECK(removeOp(state, &op, &prevOp));
|
||||
TIME_STOP(2);
|
||||
@@ -795,7 +798,7 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr
|
||||
op = op->next;
|
||||
}
|
||||
}
|
||||
-return ncclSuccess;
|
||||
+return status;
|
||||
}
|
||||
|
||||
NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);
|
||||
|
||||
Ссылка в новой задаче
Block a user