Doubling buffer size to fix NCCL INFO corruption with increased channels (#1035)

Этот коммит содержится в:
Wenkai Du
2024-01-08 08:14:33 -08:00
коммит произвёл GitHub
родитель e5bf56c6d8
Коммит f7e39fced2
3 изменённых файлов: 7 добавлений и 7 удалений
+1 -1
Просмотреть файл
@@ -166,7 +166,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
cudaGetDevice(&cudaDev);
}
-char buffer[1024];
+char buffer[2048];
size_t len = 0;
if (level == NCCL_LOG_WARN) {
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
+3 -3
Просмотреть файл
@@ -1371,16 +1371,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
-char line[1024];
+char line[2048];
line[0]='\0';
for (int c=0; c<comm->nChannels; c++) {
struct ncclTree* tree = &comm->channels[c].tree;
-snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
+snprintf(line+strlen(line), 2047-strlen(line), " [%d] %d/%d/%d->%d->%d",
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev,
comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId);
}
-line[1023] = '\0';
+line[2047] = '\0';
INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId);
NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
+3 -3
Просмотреть файл
@@ -1025,16 +1025,16 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
-char line[1024];
+char line[2048];
line[0]='\0';
for (int c=0; c<comm->nChannels; c++) {
struct ncclTree* tree = &comm->channels[c].tree;
-snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
+snprintf(line+strlen(line), 2047-strlen(line), " [%d] %d/%d/%d->%d->%d",
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev,
comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId);
}
-line[1023] = '\0';
+line[2047] = '\0';
INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId);
//NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);