Make proxy dump print out meaningful information. (#1504)
* Make proxy dump print out meaningful information. Fixes: HPEXA-63 * Print out raw data instead.
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
35fe9e06f3
Коммит
6b2b87c9f8
@@ -74,6 +74,9 @@ struct ncclProxyOp {
|
||||
uint8_t* sendbuff;
|
||||
uint8_t* recvbuff;
|
||||
|
||||
int nextRank;
|
||||
int prevRank;
|
||||
|
||||
union ncclProxyOpSpecifics specifics;
|
||||
|
||||
struct ncclProxyOp *enqNext;
|
||||
@@ -145,6 +148,11 @@ struct ncclProxyArgs {
|
||||
struct ncclProxyArgs** proxyAppendPtr;
|
||||
|
||||
union ncclProxyOpSpecifics specifics;
|
||||
|
||||
int prevRank;
|
||||
int nextRank;
|
||||
int send;
|
||||
int retry_total;
|
||||
};
|
||||
#define NCCL_MAX_NETDEVS 128
|
||||
|
||||
|
||||
+40
-16
@@ -244,64 +244,78 @@ ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState*
|
||||
}
|
||||
|
||||
// Prints a one-line summary of proxy op `op` (labelled with poolIndex/opIndex)
// followed by one entry per sub-operation (op->nsubs of them), then returns
// ncclSuccess unconditionally.
// NOTE(review): this listing looks like a rendered diff. The printf(...) calls
// appear to be the pre-change stdout output and the fprintf(stderr, ...) calls
// their replacements, and a few statements show up in both forms. Confirm
// against the repository before treating this span as compilable code.
ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) {
|
||||
printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == ncclPatternSend ? "Send" : op->pattern == ncclPatternRecv ? "Recv" : "Coll");
|
||||
// Peer rank shown in the dump: nextRank when op->send is set, prevRank otherwise.
int peer = op->send ? op->nextRank : op->prevRank;
|
||||
// Any pattern other than ncclPatternRecv / ncclPatternSend gets the "Coll->" prefix.
bool isColl = (op->pattern != ncclPatternRecv) && (op->pattern != ncclPatternSend);
|
||||
|
||||
fprintf(stderr, "%p [%d-%d|%ld| %s",op, poolIndex, opIndex, op->opCount, isColl ? "Coll->" : "");
|
||||
fprintf(stderr, "%s", op->send ? "Send" : "Recv");
|
||||
for (int s=0; s<op->nsubs; s++) {
|
||||
struct ncclProxySubArgs* sub = op->subs+s;
|
||||
if (op->state == ncclProxyOpProgress) {
|
||||
// One-character progress code derived from the sub's step counters. It stays
// ' ' on the collective path below, which prints the raw counters instead.
char status = ' ';
|
||||
if (op->pattern == ncclPatternRecv) {
|
||||
if (op->pattern == ncclPatternRecv) { // ncclRecv
|
||||
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init
|
||||
else if (sub->received < sub->posted) status = 'R'; // Receiving
|
||||
else if (sub->received < sub->transmitted) status = 'R'; // Receiving
|
||||
else if (sub->transmitted < sub->received) status = 'F'; // Flushing
|
||||
else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU
|
||||
else status = 'D'; // Done
|
||||
} else if (op->pattern == ncclPatternSend) {
|
||||
} else if (op->pattern == ncclPatternSend) { //ncclSend
|
||||
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init
|
||||
else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU
|
||||
else if (sub->done < sub->transmitted) status = 'S'; // Sending
|
||||
else status = 'D'; // Done
|
||||
} else {
|
||||
// Send or recv within a collective. Dump raw state data.
|
||||
fprintf(stderr, " nb:%zd ns:%d p:%lu t:%lu r:%lu, d:%lu ",sub->nbytes,sub->nsteps, sub->posted, sub->transmitted, sub->received, sub->done);
|
||||
}
|
||||
printf(" %d%c/%d", sub->peer, status, sub->channelId);
|
||||
fprintf(stderr, "%c peer:%d chan:%d ", status, peer, sub->channelId);
|
||||
} else {
|
||||
printf(" %d/%d", sub->peer, sub->channelId);
|
||||
// Op is not in the Progress state: print a state tag instead of a status code.
if (op->state == ncclProxyOpNone) fprintf(stderr, "\t[]");
|
||||
else if (op->state == ncclProxyOpReady) fprintf(stderr, "\t[R]");
|
||||
else fprintf(stderr, "\t[UNDEFINED]");
|
||||
fprintf(stderr, " peer:%d channel:%d", peer, sub->channelId);
|
||||
}
|
||||
}
|
||||
printf("]");
|
||||
// The retry counter is appended only when it is positive.
if (op->retry_total > 0) fprintf(stderr, "(retries:%d)", op->retry_total);
|
||||
fprintf(stderr, "]\n");
|
||||
return ncclSuccess;
|
||||
}
|
||||
ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
|
||||
struct ncclProxyArgs* op = state->active;
|
||||
int poolIndex, opIndex;
|
||||
printf("ACTIVE OPS\n");
|
||||
int list_len = 0;
|
||||
int sublist_len = 0;
|
||||
fprintf(stderr, "ACTIVE OPS\n");
|
||||
while (op) {
|
||||
sublist_len = 0;
|
||||
NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
|
||||
if (op->state & OP_SEEN) {
|
||||
WARN("List loop at element %d-%d", poolIndex, opIndex);
|
||||
}
|
||||
NCCLCHECK(printProxyOp(op, poolIndex, opIndex));
|
||||
op->state |= OP_SEEN;
|
||||
printf("\n");
|
||||
struct ncclProxyArgs* nextOp = op->nextPeer;
|
||||
while (nextOp) {
|
||||
sublist_len++;
|
||||
NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
|
||||
if (nextOp->state & OP_SEEN) {
|
||||
WARN("List loop at element %d-%d", poolIndex, opIndex);
|
||||
}
|
||||
printf("| `-> ");
|
||||
fprintf(stderr, "| `-> ");
|
||||
NCCLCHECK(printProxyOp(nextOp, poolIndex, opIndex));
|
||||
nextOp->state |= OP_SEEN;
|
||||
printf("\n");
|
||||
if (nextOp->next) {
|
||||
WARN("Inactive op has next set!");
|
||||
}
|
||||
nextOp = nextOp->nextPeer;
|
||||
}
|
||||
if (op->nextPeer == NULL) printf("|\n");
|
||||
if (op->nextPeer == NULL) fprintf(stderr, "|\n");
|
||||
op = op->next;
|
||||
printf("v\n");
|
||||
fprintf(stderr, "v\n");
|
||||
list_len++;
|
||||
}
|
||||
printf("[X]\n");
|
||||
fprintf(stderr, "[%d]\n\n", list_len);
|
||||
|
||||
# if 0
|
||||
printf("FREE OPS\n");
|
||||
@@ -335,11 +349,10 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
|
||||
struct ncclProxyArgs* elem = pool->elems;
|
||||
for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++, elem++) {
|
||||
if ((elem->state & OP_SEEN) == 0) {
|
||||
printf("Elem %d-%d is not in any list:\n", poolIndex, e);
|
||||
fprintf(stderr, "Elem %d-%d is not in any list:\n", poolIndex, e);
|
||||
NCCLCHECK(printProxyOp(elem, poolIndex, e));
|
||||
printf("\n");
|
||||
} else {
|
||||
elem->state -= OP_SEEN;
|
||||
elem->state &= ~OP_SEEN;
|
||||
}
|
||||
}
|
||||
pool = pool->next;
|
||||
@@ -398,6 +411,10 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
|
||||
args->state = ncclProxyOpReady;
|
||||
args->progress = op->connection->tcomm->proxyProgress;
|
||||
args->proxyAppendPtr = op->connection->proxyAppendPtr;
|
||||
args->send = op->connection->send;
|
||||
args->prevRank = op->prevRank;
|
||||
args->nextRank = op->nextRank;
|
||||
args->retry_total = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -549,9 +566,13 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
|
||||
case ncclPatternPipelineTo: {
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) {
|
||||
op->prevRank = ring->prev;
|
||||
op->nextRank = ring->next;
|
||||
NCCLCHECK(SaveProxy(comm, channel, proxyRecv, ring->prev, op, op->connIndex, justInquire));
|
||||
}
|
||||
if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) {
|
||||
op->prevRank = ring->prev;
|
||||
op->nextRank = ring->next;
|
||||
NCCLCHECK(SaveProxy(comm, channel, proxySend, ring->next, op, op->connIndex, justInquire));
|
||||
}
|
||||
} break;
|
||||
@@ -626,6 +647,7 @@ static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclPr
|
||||
}
|
||||
}
|
||||
freeOp->next = state->pool;
|
||||
freeOp->nextPeer = NULL;
|
||||
state->pool = freeOp;
|
||||
DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
|
||||
#ifdef DEBUG_PROXY
|
||||
@@ -638,6 +660,7 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr
|
||||
struct ncclProxyArgs* prevOp = NULL;
|
||||
struct ncclProxyArgs* op = opStart;
|
||||
while (op) {
|
||||
op->retry_total++;
|
||||
if (op->state == ncclProxyOpNone) return ncclInternalError;
|
||||
TIME_START(0); TIME_START(1);
|
||||
NCCLCHECK(op->progress(proxyState, op));
|
||||
@@ -749,6 +772,7 @@ process_nextops:
|
||||
#include <signal.h>
|
||||
static ncclProxyProgressState* ncclLastProxyState;
|
||||
// Logs the received signal number to stderr, then dumps the proxy state held in
// the file-scope pointer ncclLastProxyState via dumpProxyState().
// NOTE(review): presumably installed as a signal handler (the registration is
// not visible here) — confirm. If so, fprintf is not on the POSIX
// async-signal-safe list, so this is only suitable as a debugging aid.
// NOTE(review): the visible part of dumpProxyState reads state->active as its
// first statement, so this appears to rely on ncclLastProxyState being non-NULL
// when the signal arrives — confirm.
void ncclDumpProxyState(int signal) {
|
||||
fprintf(stderr, "received signal %d...\n", signal);
|
||||
dumpProxyState(ncclLastProxyState);
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user