From 48821ad0d77ae27c7ae1237278e1e28074a25e81 Mon Sep 17 00:00:00 2001 From: Nusrat Islam Date: Thu, 23 May 2024 15:27:43 -0500 Subject: [PATCH] set MAXCHANNELS to 128 [ROCm/rccl commit: ef442f8f9279145eb59e0028619d74f7e12f077d] --- projects/rccl/src/device/common.h | 4 ++-- projects/rccl/src/enqueue.cc | 2 +- projects/rccl/src/graph/paths.cc | 6 +++--- projects/rccl/src/group.cc | 2 +- projects/rccl/src/include/comm.h | 2 +- projects/rccl/src/include/device.h | 3 ++- projects/rccl/src/transport.cc | 26 +++++++++++++++++++++++--- 7 files changed, 33 insertions(+), 12 deletions(-) diff --git a/projects/rccl/src/device/common.h b/projects/rccl/src/device/common.h index 138a42ce09..acb05c40ac 100644 --- a/projects/rccl/src/device/common.h +++ b/projects/rccl/src/device/common.h @@ -241,7 +241,7 @@ __forceinline__ __device__ void ncclKernelMain(struct ncclDevComm* comm, struct y = __popcll(channelMask.masks[i] & ((1ull<p2pnChannelsPerPeer = (ncclParamNChannelsPerPeer() == -2 ? nextPow2(minChannels) : ncclParamNChannelsPerPeer()); // Doubling P2P channels per peer on single node if (comm->topo->nodes[GPU].count == comm->topo->nRanks && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) comm->p2pnChannelsPerPeer *= 2; - comm->p2pnChannels = nextPow2(comm->p2pnChannels); + comm->p2pnChannels = std::min(nextPow2(comm->p2pnChannels), 4*CHANNEL_LIMIT); } // Init channels that weren't used so far @@ -918,7 +918,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { // fill the whole space of nChannels. To do so we mirror the bits in the // nChannels space. for (int c=0; cp2pnChannels; c++) { - comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels); + comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels); } return ncclSuccess; } @@ -950,4 +950,4 @@ int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) { } } return minPath >= PATH_PIX ? 0 : 1; -} \ No newline at end of file +} diff --git a/projects/rccl/src/group.cc b/projects/rccl/src/group.cc index 1924c3978b..6200aab189 100644 --- a/projects/rccl/src/group.cc +++ b/projects/rccl/src/group.cc @@ -225,7 +225,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g for (int i = 0; i < comm->nRanks; i++) { comm->tasks.peers[i].sendSeen = false; comm->tasks.peers[i].recvSeen = false; - for (int j = 0; j < 4; j++) { + for (int j = 0; j < MAXCHANNELS/64; j++) { comm->connectSend[i].masks[j] = 0UL; comm->connectRecv[i].masks[j] = 0UL; } diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h index bcc2bbf465..83973c7605 100644 --- a/projects/rccl/src/include/comm.h +++ b/projects/rccl/src/include/comm.h @@ -173,7 +173,7 @@ struct ncclNvlsMcHandleList { }; struct channelMasks { - uint64_t masks[4]; + uint64_t masks[MAXCHANNELS/64]; }; struct ncclKernelPlan { diff --git a/projects/rccl/src/include/device.h b/projects/rccl/src/include/device.h index bd5957fb7e..4f0cec9490 100644 --- a/projects/rccl/src/include/device.h +++ b/projects/rccl/src/include/device.h @@ -61,7 +61,8 @@ union ncclLLFifoLine { }; #define WARP_SIZE warpSize -#define MAXCHANNELS 256 + +#define MAXCHANNELS 128 #define CHANNEL_LIMIT 16 #define NCCL_MAX_NTHREADS 256 diff --git a/projects/rccl/src/transport.cc b/projects/rccl/src/transport.cc index b67a248948..cc09bb0515 100644 --- a/projects/rccl/src/transport.cc +++ b/projects/rccl/src/transport.cc @@ -106,6 +106,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* timeLast = timeStart; // struct copy bool timeReported = false; + int count = 0; + int num = MAXCHANNELS/64; + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); // First time initialization for (int i=1; inRanks; i++) { @@ -123,7 +126,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* // The next M entries contain sendData, connection information for send connections // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections int p = i-(done+1); - if ((recvMask.masks[0]) || (sendMask.masks[0])) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS)); + count = 0; + for (int j = 0; j < num; j++) { + if ((recvMask.masks[j]) || (sendMask.masks[j])) { + count++; + } + } + + //if ((recvMask.masks[0]) || (sendMask.masks[0])) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS)); + if (count) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS)); + recvData[p] = data[p]; int sendChannels = 0, recvChannels = 0; int type; @@ -220,7 +232,15 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* } TIME_STOP(4); } - if (sendMask.masks[0] || recvMask.masks[0]) { + + count = 0; + for (int j = 0; j < num; j++) { + if ((recvMask.masks[j]) || (sendMask.masks[j])) { + count++; + } + } + //if (sendMask.masks[0] || recvMask.masks[0]) { + if (count) { free(data[p]); data[p] = NULL; } @@ -262,7 +282,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* int sendPeer = (comm->rank + i) % comm->nRanks; int flag = 0; - for (int j = 0; j < 4; j++) { + for (int j = 0; j < MAXCHANNELS/64; j++) { if (recvPeer != sendPeer) { if (comm->connectSend[sendPeer].masks[j] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);