From f7e39fced25b445b26c9bcdf5bc7b59c56fde8f8 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Mon, 8 Jan 2024 08:14:33 -0800 Subject: [PATCH] Doubling buffer size to fix NCCL INFO corruption with increased channels (#1035) --- src/debug.cc | 2 +- src/init.cc | 6 +++--- tools/topo_expl/utils.cpp | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/debug.cc b/src/debug.cc index b88fa5982a..763ad1e5d7 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -166,7 +166,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file cudaGetDevice(&cudaDev); } - char buffer[1024]; + char buffer[2048]; size_t len = 0; if (level == NCCL_LOG_WARN) { len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ", diff --git a/src/init.cc b/src/init.cc index cddc7db838..d7942e9f76 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1371,16 +1371,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); - char line[1024]; + char line[2048]; line[0]='\0'; for (int c=0; c<comm->nChannels; c++) { struct ncclTree* tree = &comm->channels[c].tree; - snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d", + snprintf(line+strlen(line), 2047-strlen(line), " [%d] %d/%d/%d->%d->%d", c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up); INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId); } - line[1023] = '\0'; + line[2047] = '\0'; INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId); NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail); diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp index fd9201699e..7c3547f5b3 100644 --- a/tools/topo_expl/utils.cpp +++ 
b/tools/topo_expl/utils.cpp @@ -1025,16 +1025,16 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); - char line[1024]; + char line[2048]; line[0]='\0'; for (int c=0; c<comm->nChannels; c++) { struct ncclTree* tree = &comm->channels[c].tree; - snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d", + snprintf(line+strlen(line), 2047-strlen(line), " [%d] %d/%d/%d->%d->%d", c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up); INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId); } - line[1023] = '\0'; + line[2047] = '\0'; INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId); //NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);