diff --git a/src/proxy.cc b/src/proxy.cc index 90b97b180c..f70e393e56 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -779,6 +779,7 @@ static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclPr static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) { struct ncclProxyArgs* prevOp = NULL; struct ncclProxyArgs* op = opStart; + ncclResult_t status = ncclSuccess; while (op) { op->retry_total++; if (op->state == ncclProxyOpNone) return ncclInternalError; @@ -787,6 +788,8 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; if (op->state == ncclProxyOpNone || ret != ncclSuccess) { + //track first error that occurred + if (ret != ncclSuccess && status == ncclSuccess) status = ret; TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); TIME_STOP(2); @@ -795,7 +798,7 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr op = op->next; } } - return ncclSuccess; + return status; } NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);