From 364fd6e6831aaf4db9e280ef9b1fd50e6b9bfec6 Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Tue, 22 Apr 2025 13:55:13 -0700 Subject: [PATCH] NCCL 2.26.5-1 Work around a potential hang in alltoall-like communication patterns on MNNVL systems at a scale of over 80 ranks. [ROCm/rccl commit: 3000e3c797b4b236221188c07aa09c1f3a0170d4] --- projects/rccl/makefiles/version.mk | 2 +- projects/rccl/src/init.cc | 22 ++++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk index 93a71d49d8..c5ed6ab708 100644 --- a/projects/rccl/makefiles/version.mk +++ b/projects/rccl/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 26 -NCCL_PATCH := 3 +NCCL_PATCH := 5 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 46b02e65e1..47d7fa3c66 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -271,7 +271,7 @@ NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); #define NCCL_WORK_FIFO_BYTES_DEFAULT (1<<20) -NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", NCCL_WORK_FIFO_BYTES_DEFAULT); +NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", -1); NCCL_PARAM(WorkArgsBytes, "WORK_ARGS_BYTES", INT64_MAX); enum ncclLaunchMode ncclParamLaunchMode; @@ -458,12 +458,22 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { if (ccEnable) { comm->workFifoBytes = 0; } else { - comm->workFifoBytes = ncclParamWorkFifoBytes(); - if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { - WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes); - comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + int64_t workFifoBytesParam = ncclParamWorkFifoBytes(); + if (workFifoBytesParam == -1) { + if (comm->MNNVL && (comm->compCap >= 100)) { + // WAR: Disable work fifo for Blackwell all2all hang issue on MNNVL + INFO(NCCL_INIT, "Disabling work fifo"); + comm->workFifoBytes = 0; + } else { + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + } + } else { + if (0 != (workFifoBytesParam & (workFifoBytesParam-1))) { + WARN("NCCL_WORK_FIFO_BYTES=%ld is being ignored because it is not a power of 2.", workFifoBytesParam); + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + } + comm->workFifoBytes = std::min(workFifoBytesParam, 1ul<<30); } - comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30); } if (comm->rank == 0) {