@@ -241,7 +241,7 @@ __forceinline__ __device__ void ncclKernelMain(struct ncclDevComm* comm, struct
|
||||
y = __popcll(channelMask.masks[i] & ((1ull<<x)-1));
|
||||
y = total + y;
|
||||
if (blockIdx.x == y) {
|
||||
ncclShmem.channelId = y;
|
||||
ncclShmem.channelId = x + total;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -251,7 +251,7 @@ __forceinline__ __device__ void ncclKernelMain(struct ncclDevComm* comm, struct
|
||||
y = __popcll(channelMask.masks[i] & ((1ull<<x)-1));
|
||||
y = y + total;
|
||||
if (blockIdx.x == y) {
|
||||
ncclShmem.channelId = y;
|
||||
ncclShmem.channelId = x + total;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -618,7 +618,7 @@ static void finishPlan(struct ncclKernelPlan* plan) {
|
||||
int channelCount = 0;
|
||||
//uint64_t channelMask = 0;
|
||||
struct channelMasks channelMask;
|
||||
for (int i =0; i < 4; i++) {
|
||||
for (int i =0; i < MAXCHANNELS/64; i++) {
|
||||
channelMask.masks[i] = 0;
|
||||
}
|
||||
bool hasProxyOps = false;
|
||||
|
||||
@@ -908,7 +908,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
comm->p2pnChannelsPerPeer = (ncclParamNChannelsPerPeer() == -2 ? nextPow2(minChannels) : ncclParamNChannelsPerPeer());
|
||||
// Doubling P2P channels per peer on single node
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) comm->p2pnChannelsPerPeer *= 2;
|
||||
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
|
||||
comm->p2pnChannels = std::min(nextPow2(comm->p2pnChannels), 4*CHANNEL_LIMIT);
|
||||
}
|
||||
|
||||
// Init channels that weren't used so far
|
||||
@@ -918,7 +918,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
// fill the whole space of nChannels. To do so we mirror the bits in the
|
||||
// nChannels space.
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
|
||||
comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -950,4 +950,4 @@ int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {
|
||||
}
|
||||
}
|
||||
return minPath >= PATH_PIX ? 0 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -225,7 +225,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
comm->tasks.peers[i].sendSeen = false;
|
||||
comm->tasks.peers[i].recvSeen = false;
|
||||
for (int j = 0; j < 4; j++) {
|
||||
for (int j = 0; j < MAXCHANNELS/64; j++) {
|
||||
comm->connectSend[i].masks[j] = 0UL;
|
||||
comm->connectRecv[i].masks[j] = 0UL;
|
||||
}
|
||||
|
||||
@@ -173,7 +173,7 @@ struct ncclNvlsMcHandleList {
|
||||
};
|
||||
|
||||
struct channelMasks {
|
||||
uint64_t masks[4];
|
||||
uint64_t masks[MAXCHANNELS/64];
|
||||
};
|
||||
|
||||
struct ncclKernelPlan {
|
||||
|
||||
@@ -61,7 +61,8 @@ union ncclLLFifoLine {
|
||||
};
|
||||
|
||||
#define WARP_SIZE warpSize
|
||||
#define MAXCHANNELS 256
|
||||
|
||||
#define MAXCHANNELS 128
|
||||
#define CHANNEL_LIMIT 16
|
||||
|
||||
#define NCCL_MAX_NTHREADS 256
|
||||
|
||||
@@ -106,6 +106,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
timeLast = timeStart; // struct copy
|
||||
bool timeReported = false;
|
||||
|
||||
int count = 0;
|
||||
int num = MAXCHANNELS/64;
|
||||
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
|
||||
// First time initialization
|
||||
for (int i=1; i<comm->nRanks; i++) {
|
||||
@@ -123,7 +126,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
// The next M entries contain sendData, connection information for send connections
|
||||
// It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
|
||||
int p = i-(done+1);
|
||||
if ((recvMask.masks[0]) || (sendMask.masks[0])) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
|
||||
count = 0;
|
||||
for (int j = 0; j < num; j++) {
|
||||
if ((recvMask.masks[j]) || (sendMask.masks[j])) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
//if ((recvMask.masks[0]) || (sendMask.masks[0])) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
|
||||
if (count) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
|
||||
|
||||
recvData[p] = data[p];
|
||||
int sendChannels = 0, recvChannels = 0;
|
||||
int type;
|
||||
@@ -220,7 +232,15 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
}
|
||||
TIME_STOP(4);
|
||||
}
|
||||
if (sendMask.masks[0] || recvMask.masks[0]) {
|
||||
|
||||
count = 0;
|
||||
for (int j = 0; j < num; j++) {
|
||||
if ((recvMask.masks[j]) || (sendMask.masks[j])) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
//if (sendMask.masks[0] || recvMask.masks[0]) {
|
||||
if (count) {
|
||||
free(data[p]);
|
||||
data[p] = NULL;
|
||||
}
|
||||
@@ -262,7 +282,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
int sendPeer = (comm->rank + i) % comm->nRanks;
|
||||
int flag = 0;
|
||||
|
||||
for (int j = 0; j < 4; j++) {
|
||||
for (int j = 0; j < MAXCHANNELS/64; j++) {
|
||||
if (recvPeer != sendPeer) {
|
||||
if (comm->connectSend[sendPeer].masks[j] != 0UL)
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
|
||||
|
||||
Reference in New Issue
Block a user