Doubling buffer size to fix NCCL INFO corruption with increased channels (#1035)
This commit is contained in:
+1
-1
@@ -166,7 +166,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
cudaGetDevice(&cudaDev);
|
||||
}
|
||||
|
||||
char buffer[1024];
|
||||
char buffer[2048];
|
||||
size_t len = 0;
|
||||
if (level == NCCL_LOG_WARN) {
|
||||
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
|
||||
|
||||
+3
-3
@@ -1371,16 +1371,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
|
||||
|
||||
char line[1024];
|
||||
char line[2048];
|
||||
line[0]='\0';
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclTree* tree = &comm->channels[c].tree;
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
|
||||
snprintf(line+strlen(line), 2047-strlen(line), " [%d] %d/%d/%d->%d->%d",
|
||||
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
|
||||
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev,
|
||||
comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId);
|
||||
}
|
||||
line[1023] = '\0';
|
||||
line[2047] = '\0';
|
||||
INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId);
|
||||
|
||||
NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
|
||||
|
||||
@@ -1025,16 +1025,16 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
|
||||
|
||||
char line[1024];
|
||||
char line[2048];
|
||||
line[0]='\0';
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclTree* tree = &comm->channels[c].tree;
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
|
||||
snprintf(line+strlen(line), 2047-strlen(line), " [%d] %d/%d/%d->%d->%d",
|
||||
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
|
||||
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev,
|
||||
comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId);
|
||||
}
|
||||
line[1023] = '\0';
|
||||
line[2047] = '\0';
|
||||
INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId);
|
||||
|
||||
//NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
|
||||
|
||||
Reference in new issue
Block a user