From 8cf28704ce308fe405c249d3417ebf6e6a7f1a9b Mon Sep 17 00:00:00 2001
From: AtlantaPepsi
Date: Fri, 26 Apr 2024 10:54:27 -0500
Subject: [PATCH] prevent segfault from npkit-enabled rccl build

Signed-off-by: AtlantaPepsi

[ROCm/rccl commit: 67246649ac4a29505172fc285890095ec35561b1]
---
 projects/rccl/src/init.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index 23abcf8aae..d383d4e924 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -1208,6 +1208,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       ret = ncclInternalError;
       goto fail;
     }
+    #if defined(ENABLE_NPKIT)
+    if (intraProcRanks != 1) {
+      WARN("NPKit currently does not support more than 1 device per process");
+      ret = ncclInternalError;
+      goto fail;
+    }
+    #endif
     struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
     assert(intraProcRank==0 ? comm==comm0 : true);
     comm->intraComm0 = comm0;