Add support for IB SHARP 1PPN operation with user buffers.
Improve support for MNNVL, add NVLS support and multi-clique support.
 * Detect the NVLS clique through NVML
 * Exchange XML between peers in the same NVLS clique and fuse XMLs
   before creating the topology graph.
 * Rework bootstrap allgather algorithms to allow for large allgather
   operations intra-node (XML exchange).
Net/IB: add support for dynamic GID detection.
 * Automatically select the RoCEv2/IPv4 interface by default. Allow
   selecting IPv6, or even a specific network/mask.
Reduce NVLS memory usage.
 * Add stepSize as property of a connection to allow for different
   sizes on different peers; set it to 128K for NVLink SHARP.
Improve tuner loading
 * Look for more paths, be more consistent with the network device
   plugin.
 * Also search for tuner support inside the net plugin.
Improve tuner API
 * Add context to support multi-device per process.
Add magic number around comm object to detect comm corruption.
 * Add some basic checks around communicators so that we can report a
   problem when a communicator gets corrupted or a wrong comm pointer
   is passed to NCCL.
Fix net/IB error path. Github PR #1164
Fix collnet rail mapping with split comm.
Fix packet reordering issue causing bootstrap mismatch
 * Use a different tag in ncclTransportP2pSetup for the connectInfo
   exchange and the following barrier.
Fix hang when crossNic is inconsistent between ranks.
Fix minCompCap/maxCompCap computation. Github issue #1184


[ROCm/rccl commit: ab2b89c4c3]
Šī revīzija ir iekļauta:
Sylvain Jeaugey
2024-03-26 06:08:55 -07:00
vecāks b492ab6313
revīzija 792c33598f
52 mainīti faili ar 2124 papildinājumiem un 944 dzēšanām
@@ -7,8 +7,6 @@
#ifndef NET_DEVICE_H_
#define NET_DEVICE_H_
#include "net_device.h"
#define NCCL_NET_DEVICE_INVALID_VERSION 0x0
#define NCCL_NET_MTU_SIZE 4096
@@ -39,13 +39,17 @@ typedef struct {
const char* name;
// Initializes tuner states.
// nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
// nNodes: number of nodes in current communicator.
// logFunction: a logFunction can be useful to integrate logging together with NCCL core.
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
// Inputs:
// - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
// - nNodes: number of nodes in current communicator.
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
// Outputs:
// - context: tuner context object
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
// Inputs:
// - context: tuner context object
// - collType: collective type , e.g., allreduce, allgather…
// - nBytes: collective size in bytes
// - collNetSupport: whether collnet supports this type
@@ -62,16 +66,17 @@ typedef struct {
// Also, the plugin is allowed to not set any output, or set only the
// algorithm and protocol, but not only the algorithm or only the protocol.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
// Terminates the plugin and cleans up any resources that the plugin allocated.
ncclResult_t (*destroy)();
} ncclTuner_v1_t;
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v2_t;
typedef ncclTuner_v1_t ncclTuner_t;
typedef ncclTuner_v2_t ncclTuner_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
#endif
@@ -8,17 +8,17 @@
#define __hidden __attribute__ ((visibility("hidden")))
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { return ncclSuccess; }
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
__hidden ncclResult_t pluginGetCollInfo(ncclFunc_t collType, size_t nBytes,
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
__hidden ncclResult_t pluginDestroy() { return ncclSuccess; }
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
#define PLUGIN_NAME "Example"
const ncclTuner_v1_t ncclTunerPlugin_v1 = {
const ncclTuner_v2_t ncclTunerPlugin_v2 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.getCollInfo = pluginGetCollInfo,
+1 -1
Parādīt failu
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 20
NCCL_MINOR := 21
NCCL_PATCH := 5
NCCL_SUFFIX :=
PKG_REVISION := 1
+148 -103
Parādīt failu
@@ -80,6 +80,16 @@ static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int si
NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size)));
return ncclSuccess;
}
// Exchanges one length-prefixed message in each direction.
//
// First the two sides swap their payload sizes (sendSize goes out on sendSock, the
// peer's size comes in on recvSock), then the payloads themselves are exchanged.
//
// - sendSock/sendData/sendSize: socket, buffer and byte count of the outgoing message.
// - recvSock/recvData/recvSize: socket, buffer and capacity (in bytes) for the incoming message.
//
// Returns ncclInternalError if the peer announces more bytes than recvSize; otherwise
// propagates the result of the socket operations.
static ncclResult_t bootstrapNetSendRecv(struct ncclSocket* sendSock, void* sendData, int sendSize, struct ncclSocket* recvSock, void* recvData, int recvSize) {
  int senderRecvSize;
  NCCLCHECK(ncclSocketSendRecv(sendSock, &sendSize, sizeof(int), recvSock, &senderRecvSize, sizeof(int)));
  if (senderRecvSize > recvSize) {
    WARN("Message truncated : received %d bytes instead of %d", senderRecvSize, recvSize);
    return ncclInternalError;
  }
  // Receive only what the peer announced, mirroring bootstrapNetRecv. Asking for the full
  // buffer capacity would presumably wait for bytes that a shorter message never sends.
  NCCLCHECK(ncclSocketSendRecv(sendSock, sendData, sendSize, recvSock, recvData, std::min(recvSize, senderRecvSize)));
  return ncclSuccess;
}
struct extInfo {
int rank;
@@ -390,103 +400,40 @@ fail:
goto exit;
}
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
// Bootstrap send/receive functions
//
// We do not keep connections opened with all ranks at all times, and we have no guarantee
// that connections to our unique listen socket will arrive in the same order as we need
// them. Therefore, when establishing a connection, the sender sends a (peer, tag) tuple to
// allow the receiver to identify the flow, and keep it in an unexpected queue if needed.
ncclResult_t bootstrapConnect(void* commState, int peer, int tag, struct ncclSocket* sock) {
ncclResult_t ret = ncclSuccess;
struct bootstrapState* state = (struct bootstrapState*)commState;
char* data = (char*)allData;
int rank = state->rank;
int nranks = state->nranks;
TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
/* Simple ring based AllGather
* At each step i receive data from (rank-i-1) from left
* and send previous step's data from (rank-i) to right
*/
for (int i=0; i<nranks-1; i++) {
size_t rslice = (rank - i - 1 + nranks) % nranks;
size_t sslice = (rank - i + nranks) % nranks;
// Send slice to the right
NCCLCHECK(bootstrapNetSend(&state->ringSendSocket, data+sslice*size, size));
// Recv slice from the left
NCCLCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
NCCLCHECKGOTO(ncclSocketInit(sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
NCCLCHECKGOTO(ncclSocketConnect(sock), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(sock, &state->rank, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(sock, &tag, sizeof(int)), ret, fail);
return ncclSuccess;
fail:
NCCLCHECK(ncclSocketClose(sock));
return ret;
}
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
ncclResult_t ret = ncclSuccess;
struct bootstrapState* state = (struct bootstrapState*)commState;
struct ncclSocket sock;
NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail);
TRACE(NCCL_BOOTSTRAP, "Sending to peer=%d tag=%d size=%d", peer, tag, size);
NCCLCHECK(bootstrapConnect(commState, peer, tag, &sock));
NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, exit);
TRACE(NCCL_BOOTSTRAP, "Sent to peer=%d tag=%d size=%d", peer, tag, size);
exit:
NCCLCHECK(ncclSocketClose(&sock));
return ret;
fail:
goto exit;
}
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
if (nranks == 1) return ncclSuccess;
TRACE(NCCL_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag);
/* Simple intra process barrier
*
* Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet,
* "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
*/
int data[1];
for (int mask=1; mask<nranks; mask<<=1) {
int src = (rank - mask + nranks) % nranks;
int dst = (rank + mask) % nranks;
NCCLCHECK(bootstrapSend(commState, ranks[dst], tag, data, sizeof(data)));
NCCLCHECK(bootstrapRecv(commState, ranks[src], tag, data, sizeof(data)));
}
TRACE(NCCL_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag);
return ncclSuccess;
}
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size) {
if (nranks == 1) return ncclSuccess;
char* data = (char*)allData;
TRACE(NCCL_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size);
for (int i=1; i<nranks; i++) {
int src = (rank - i + nranks) % nranks;
int dst = (rank + i) % nranks;
NCCLCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data+rank*size, size));
NCCLCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data+src*size, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
return ncclSuccess;
}
// IntraNode in-place Broadcast
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size) {
if (nranks == 1) return ncclSuccess;
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - ENTER", rank, nranks, root, size);
if (rank == root) {
for (int i=0; i<nranks; i++) {
if (i != root) NCCLCHECK(bootstrapSend(commState, ranks[i], /*tag=*/ranks[i], bcastData, size));
}
}
else {
NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/ranks[rank], bcastData, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
return ncclSuccess;
}
ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
@@ -543,38 +490,136 @@ static void unexpectedFree(struct bootstrapState* state) {
}
// We can't know who we'll receive from, so we need to receive everything at once
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
ncclResult_t bootstrapAccept(void* commState, int peer, int tag, struct ncclSocket* sock) {
ncclResult_t ret = ncclSuccess;
struct bootstrapState* state = (struct bootstrapState*)commState;
struct ncclSocket sock;
int newPeer, newTag;
// Search unexpected connections first
int found;
NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock, &found));
if (found) {
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
goto exit;
}
NCCLCHECK(unexpectedDequeue(state, peer, tag, sock, &found));
if (found) return ncclSuccess;
// Then look for new connections
while (1) {
NCCLCHECKGOTO(ncclSocketInit(&sock), ret, fail);
NCCLCHECKGOTO(ncclSocketAccept(&sock, &state->listenSock), ret, fail);
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail);
if (newPeer == peer && newTag == tag) {
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
goto exit;
}
// Unexpected connection. Save for later.
NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(sock), ret, fail);
NCCLCHECKGOTO(ncclSocketAccept(sock, &state->listenSock), ret, fail);
NCCLCHECKGOTO(bootstrapNetRecv(sock, &newPeer, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetRecv(sock, &newTag, sizeof(int)), ret, fail);
if (newPeer == peer && newTag == tag) return ncclSuccess;
NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, sock), ret, fail);
}
return ncclSuccess;
fail:
NCCLCHECK(ncclSocketClose(sock));
return ret;
}
// We can't know who we'll receive from, so we need to receive everything at once
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
ncclResult_t ret;
struct ncclSocket sock;
NCCLCHECK(bootstrapAccept(commState, peer, tag, &sock));
TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size);
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, exit);
exit:
NCCLCHECK(ncclSocketClose(&sock));
return ret;
fail:
goto exit;
}
// Collective algorithms, based on bootstrapSend/Recv, and sometimes bootstrapConnect/Accept
// Ring all-gather over two already-connected sockets.
//
// `data` holds nranks slices of `size` bytes each; on entry the caller's own slice (index
// `rank`) is expected to be filled in. Every one of the nranks-1 steps forwards to
// nextSocket the slice obtained during the previous step (our own slice on the first step)
// while receiving a new slice from prevSocket.
ncclResult_t bootstrapRingAllGather(struct ncclSocket* prevSocket, struct ncclSocket* nextSocket, int rank, int nranks, char* data, int size) {
  int step = 0;
  while (step < nranks-1) {
    // Slice we forward this step, and slice we expect from the previous rank.
    const size_t sendSlot = (rank - step + nranks) % nranks;
    const size_t recvSlot = (rank - step - 1 + nranks) % nranks;
    char* sendPtr = data + sendSlot*size;
    char* recvPtr = data + recvSlot*size;
    NCCLCHECK(bootstrapNetSendRecv(nextSocket, sendPtr, size, prevSocket, recvPtr, size));
    step++;
  }
  return ncclSuccess;
}
// All-gather across every rank of the communicator, run over the ring sockets stored in
// the bootstrap state. `allData` is treated as nranks consecutive slices of `size` bytes.
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
  struct bootstrapState* bootstrap = (struct bootstrapState*)commState;
  const int myRank = bootstrap->rank;
  const int totalRanks = bootstrap->nranks;
  char* gatherBuf = (char*)allData;

  TRACE(NCCL_INIT, "rank %d nranks %d size %d", myRank, totalRanks, size);

  // ringRecvSocket is the "prev" side of the ring, ringSendSocket the "next" side.
  NCCLCHECK(bootstrapRingAllGather(&bootstrap->ringRecvSocket, &bootstrap->ringSendSocket, myRank, totalRanks, gatherBuf, size));

  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", myRank, totalRanks, size);
  return ncclSuccess;
}
// Barrier among `nranks` participants built on bootstrapSend/bootstrapRecv.
//
// - ranks: optional table translating a participant index into the peer id passed to
//          bootstrapSend/bootstrapRecv; when NULL the index itself is used.
// - rank:  index of the caller among the participants.
// - tag:   tag attached to every message of this barrier.
ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
  if (nranks == 1) return ncclSuccess;
  TRACE(NCCL_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag);

  /* Simple [intra] process barrier
   *
   * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manber,
   * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
   */
  // The payload is only a token, but it goes over the socket: zero it so we never
  // transmit uninitialized stack bytes.
  int data[1] = { 0 };
  for (int mask=1; mask<nranks; mask<<=1) {
    int src = (rank - mask + nranks) % nranks;
    int dst = (rank + mask) % nranks;
    NCCLCHECK(bootstrapSend(commState, ranks ? ranks[dst] : dst, tag, data, sizeof(data)));
    NCCLCHECK(bootstrapRecv(commState, ranks ? ranks[src] : src, tag, data, sizeof(data)));
  }

  TRACE(NCCL_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag);
  return ncclSuccess;
}
// Barrier where participant indices are used directly as peer ids: delegates to
// bootstrapIntraNodeBarrier without a rank translation table.
ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag) {
  int* noRankTable = NULL;
  ncclResult_t res = bootstrapIntraNodeBarrier(commState, noRankTable, rank, nranks, tag);
  return res;
}
// All-gather among the `nranks` participants listed in `ranks`, run as a ring over a pair
// of temporary connections (to the next participant, from the previous one).
//
// - ranks:   table translating a participant index into the peer id used to connect/accept.
// - rank:    index of the caller among the participants; also the index of its slice.
// - allData: nranks consecutive slices of `size` bytes.
//
// Both sockets are closed on every path once they have been opened. The first error
// encountered is the one returned.
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size) {
  if (nranks == 1) return ncclSuccess;
  ncclResult_t ret = ncclSuccess;
  ncclResult_t closeRet;
  TRACE(NCCL_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size);

  int prevRank = ranks[(rank - 1 + nranks)%nranks];
  int nextRank = ranks[(rank + 1) % nranks];
  struct ncclSocket prevSocket, nextSocket;

  // bootstrapConnect closes its socket itself on its failure path, so nothing to release here.
  NCCLCHECK(bootstrapConnect(commState, nextRank, 0, &nextSocket));
  // From here on nextSocket is open and must be closed even if a later step fails.
  NCCLCHECKGOTO(bootstrapAccept(commState, prevRank, 0, &prevSocket), ret, closeNext);

  NCCLCHECKGOTO(bootstrapRingAllGather(&prevSocket, &nextSocket, rank, nranks, (char*)allData, size), ret, closePrev);

closePrev:
  closeRet = ncclSocketClose(&prevSocket);
  if (ret == ncclSuccess) ret = closeRet;
closeNext:
  closeRet = ncclSocketClose(&nextSocket);
  if (ret == ncclSuccess) ret = closeRet;
  if (ret != ncclSuccess) return ret;

  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
  return ncclSuccess;
}
// [IntraNode] in-place broadcast.
//
// The participant at index `root` sends `size` bytes of bcastData to every other
// participant, one bootstrapSend per peer; everyone else receives into bcastData.
// `ranks`, when non-NULL, translates a participant index into a peer id; otherwise the
// index is used as is. Each message is tagged with the destination's peer id.
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size) {
  if (nranks == 1) return ncclSuccess;
  TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - ENTER", rank, nranks, root, size);

  if (rank != root) {
    // Receiver: the sender is the root, the tag is our own peer id.
    const int rootPeer = ranks ? ranks[root] : root;
    const int selfTag = ranks ? ranks[rank] : rank;
    NCCLCHECK(bootstrapRecv(commState, rootPeer, /*tag=*/selfTag, bcastData, size));
  } else {
    for (int idx = 0; idx < nranks; idx++) {
      if (idx == root) continue;
      // Destination peer id doubles as the message tag.
      const int dstPeer = ranks ? ranks[idx] : idx;
      NCCLCHECK(bootstrapSend(commState, dstPeer, /*tag=*/dstPeer, bcastData, size));
    }
  }

  TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
  return ncclSuccess;
}
// In-place broadcast where participant indices are used directly as peer ids: delegates
// to bootstrapIntraNodeBroadcast without a rank translation table.
ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size) {
  int* noRankTable = NULL;
  ncclResult_t res = bootstrapIntraNodeBroadcast(commState, noRankTable, rank, nranks, root, bcastData, size);
  return res;
}
ncclResult_t bootstrapClose(void* commState) {
+7 -5
Parādīt failu
@@ -13,7 +13,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
if (channel->id != -1) return ncclSuccess;
int nRanks = comm->nRanks;
int nPeers = nRanks + 1 /* Collnet */ + comm->localRanks /* NVLS */;
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
channel->id = channelId;
channel->workFifoSent = 0;
@@ -73,10 +74,11 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
if (share) {
channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
for (int r = 0; r < comm->localRanks; ++r) {
for (int r = 0; r < nvlsRanks; ++r) {
int tr = comm->topParentLocalRanks[r];
uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr);
channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr;
@@ -85,9 +87,9 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount);
}
} else {
NCCLCHECK(ncclCalloc(&channel->nvlsPeers, comm->localRanks));
NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, comm->localRanks, sharedRes->deviceStream.cudaStream));
for (int r = 0; r < comm->localRanks; ++r) {
NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks));
NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream));
for (int r = 0; r < nvlsRanks; ++r) {
uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r);
channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r;
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+16 -8
Parādīt failu
@@ -23,7 +23,8 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
@@ -46,7 +47,8 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
@@ -67,14 +69,16 @@ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, n
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
return ncclEnqueueCheck(&info);
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
NCCLCHECK(ncclBroadcast(buff, buff, count, datatype, root, comm, stream));
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
@@ -98,7 +102,8 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
@@ -120,7 +125,8 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
NCCLCHECK(ncclEnqueueCheck(&info));
return ncclSuccess;
}
struct NvtxParamsSendRecv {
@@ -144,7 +150,8 @@ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatyp
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
exit:
NCCLCHECK(ncclGroupEnd());
return ret;
}
@@ -161,7 +168,8 @@ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
exit:
NCCLCHECK(ncclGroupEnd());
return ret;
}
+4
Parādīt failu
@@ -79,6 +79,10 @@ void ncclDebugInit() {
mask = NCCL_PROXY;
} else if (strcasecmp(subsys, "NVLS") == 0) {
mask = NCCL_NVLS;
} else if (strcasecmp(subsys, "BOOTSTRAP") == 0) {
mask = NCCL_BOOTSTRAP;
} else if (strcasecmp(subsys, "REG") == 0) {
mask = NCCL_REG;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
+38 -22
Parādīt failu
@@ -253,18 +253,26 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
int tn = nWarps1*WARP_SIZE;
if (tid < tn) {
// Phase 1: send to network
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr,
/*redOpArg=*/0, 0*Proto::MaxGroupWidth, 1, 1);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
ssize_t railAllBeg = railGridOffset + args->bid*chunkSize;
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
ssize_t railOneBeg = ncclShmem.comm.node*sizePerRank;
ssize_t railOneEnd = railOneBeg + sizePerRank;
ssize_t beg = max(railAllBeg, railOneBeg);
ssize_t end = min(railAllEnd, railOneEnd);
prims.send(beg-railOneBeg, max(ssize_t(0), end-beg));
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == 0) {
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
}
__syncwarp();
} else {
// Phase 1: send to network
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr,
/*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
ssize_t railOneEnd = railOneBeg + sizePerRank;
ssize_t beg = max(railAllBeg, railOneBeg);
ssize_t end = min(railAllEnd, railOneEnd);
prims.send(beg - railOneBeg, max(ssize_t(0), end - beg));
}
}
return;
}
@@ -272,16 +280,24 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
tn = nWarps2*WARP_SIZE;
if (tid < tn) {
// Phase 2: Recv network -> deposit output + send to bcast
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, tn, &direct->out, direct->heads+1, nullptr, nullptr,
/*redOpArg=*/0, 1*Proto::MaxGroupWidth, 0, 0);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
Scatterer</*BcastSendNotRecv=*/true> scat;
scat.args = args;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.process</*Recv=*/1, /*Send=*/1>(scat);
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == 0) {
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
}
__syncwarp();
} else {
// Phase 2: Recv network -> deposit output + send to bcast
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, tn, &direct->out, direct->heads + 1, nullptr, nullptr,
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
Scatterer</*BcastSendNotRecv=*/true> scat;
scat.args = args;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.process</*Recv=*/1, /*Send=*/1>(scat);
}
}
return;
}
+72 -29
Parādīt failu
@@ -297,13 +297,21 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
}
} else {
// Directly send to network
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == tidStartReduce) {
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
}
__syncwarp();
} else {
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
}
}
}
} else if (tid < tidStartBcast && hasUp) {
@@ -328,14 +336,22 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
}
} else {
// Recv from network (no post thread needed)
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem, /*postOp=*/true);
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == tidStartBcast) {
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
}
__syncwarp();
} else {
// Recv from network (no post thread needed)
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
prims.recv(offset, nelem, /*postOp=*/true);
}
}
}
}
@@ -616,21 +632,31 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
groupNthreads = nthreads-nthreadsSplit;
}
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group*Proto::MaxGroupWidth, connIndex, connIndex);
if (tid < nthreadsSplit) {
if (recv == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (groupTid == 0) {
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps);
}
__syncwarp();
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.send(offset, nelem);
}
}
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.recvReduceSend(offset, nelem);
}
}
@@ -639,19 +665,36 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
if (recv == nranks) {
// I'm the first in the broadcast chain, I need to perform the division (postOp)
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem, /*postOp*/true);
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (groupTid == 0) {
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps);
}
__syncwarp();
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.recv(offset, nelem, /*postOp*/true);
}
}
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.recvCopyDirectSend(offset, nelem, /*postOp*/true);
}
}
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
+8 -6
Parādīt failu
@@ -18,19 +18,21 @@ typedef void(*ncclDevFuncPtr_t)();
extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[];
struct ncclShmemGroup {
ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
void* srcs[NCCL_MAX_NVLS_ARITY+1];
void* dsts[NCCL_MAX_NVLS_ARITY+1];
ncclConnInfo *recvConns[NCCL_MAX_ARITY];
ncclConnInfo *sendConns[NCCL_MAX_ARITY];
void* userInput;
void* userOutput;
void* srcs[NCCL_MAX_ARITY+1];
void* dsts[NCCL_MAX_ARITY+1];
union {
unpackGroupShmem unpack;
} devicePlugin;
int32_t dstSizes[NCCL_MAX_NVLS_ARITY+1];
int32_t dstSizes[NCCL_MAX_ARITY+1];
};
struct ncclShmemData {
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
uint64_t redOpArgs[NCCL_MAX_ARITY+1];
int channelId;
int aborted;
alignas(16) struct ncclDevComm comm;
+64 -47
Parādīt failu
@@ -5,6 +5,7 @@
************************************************************************/
#include "network/unpack/unpack.h"
#include <cassert>
template<typename T, typename RedOp, typename Fan, int Direct,
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
@@ -13,9 +14,7 @@ class Primitives<
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
static constexpr int RoleInput = 0x01,
RoleOutput = 0x02,
RoleWaitRecv = 0x04,
static constexpr int RoleWaitRecv = 0x04, // 0x1 0x2 are free to use
RoleWaitSend = 0x08,
RolePostSend = 0x10,
RolePostRecv = 0x20,
@@ -40,13 +39,11 @@ class Primitives<
int group;
uint64_t step;
struct ncclConnFifo* connFifo = NULL;
union {
T *userBuff; // (flags & (RoleInput|RoleOutput))
T *connEltsFifo; // !(flags & (RoleInput|RoleOutput))
};
T *directBuff;
T* connEltsFifo;
T* directBuff;
uint64_t *connStepPtr;
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
int connStepSize; // Connection step size
void* mhandle;
void* netDeviceHandle;
@@ -153,7 +150,7 @@ class Primitives<
} else if (flags & DirectRead) { // empty send
ptrs[index] = nullptr;
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
} else if (!isSendNotRecv && DirectRecv) {
if (flags & (DirectRead | NvlsDirectRead)) {
@@ -161,11 +158,11 @@ class Primitives<
} else if (flags & DirectWrite) {
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
}
else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) {
ncclNetDeviceIncrementHead(group);
@@ -232,10 +229,12 @@ class Primitives<
#endif
do {
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput)))
ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
if (tid == 0) {
T* userInput = (T*)ncclShmem.groups[group].userInput;
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
if (Src) ncclShmem.groups[group].srcs[0] = (SrcBuf==Input ? userInput : userOutput) + srcIx + offset;
if (Dst) ncclShmem.groups[group].dsts[0] = (DstBuf==Input ? userInput : userOutput) + dstIx + offset;
}
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(srcIx, dstIx, offset, sliceSize);
subBarrier();
/* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
@@ -303,6 +302,28 @@ class Primitives<
}
public:
// Advance the send-side step counter of connection `connIndex` towards `peer`
// by `steps`, then publish the updated counter through the connection's `tail`
// pointer. No data is moved by this helper itself.
// The caller visible in this file invokes it from a single thread (tid == 0) on
// the NCCL_COLLNET_REG_BUFFER path instead of constructing a Primitives object.
// NOTE(review): st_relaxed_sys_global is presumably a relaxed, system-scope
// global store -- confirm against its definition.
static inline __device__ void sendPeerNotify(int peer, int connIndex, int steps) {
  auto& sendConn = ncclShmem.channel.peers[peer]->send[connIndex];
  sendConn.step += steps;
  st_relaxed_sys_global(sendConn.tail, sendConn.step);
}
// Receive-side step notification for connection `connIndex` of `peer`:
// advances the recv step counter by `steps`, publishes the new value through
// the connection's `head` pointer, then spins until the value read through
// `tail` has reached that step (or an abort is observed).
// The callers visible in this file invoke it from a single thread
// (groupTid == 0 / tid == 0) on the NCCL_COLLNET_REG_BUFFER path, in place of
// constructing a Primitives object and looping over chunks.
static inline __device__ void recvPeerNotify(int peer, int connIndex, int steps) {
// Number of polling iterations since the abort flag was last checked.
int spins = 0;
ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
peerPtr->recv[connIndex].step += steps;
// Publish the new step via `head` before waiting on `tail`.
// NOTE(review): st_relaxed_sys_global is presumably a relaxed, system-scope store -- confirm.
st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step);
// Busy-wait until the value behind `tail` catches up with our step.
while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) {
// Only look at the abort flag once the spin counter hits the threshold, then restart the count.
if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) {
if (*ncclShmem.comm.abortFlag) {
// Record the abort in shared state and stop waiting; the step is left advanced.
ncclShmem.aborted = 1;
break;
}
spins = 0;
}
}
}
template<int Recv, int Send, typename Fn>
__device__ __forceinline__ void process(Fn &&fn) {
#pragma unroll 1
@@ -371,7 +392,7 @@ private:
if (Send) {
// Scatter pre-scales data of input buffer only in non-Direct case
constexpr int PreOpSrcs = DirectSend ? 0 : 1;
if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
if (tid==0) ncclShmem.groups[group].srcs[0] = (T*)ncclShmem.groups[group].userInput + inpIx + offset;
// realSize is not accurate here; but intra-node does not rely on sizes FIFO
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
subBarrier();
@@ -391,7 +412,7 @@ private:
}
}
} else if (Recv) {
if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
if (tid==0) ncclShmem.groups[group].dsts[0] = (T*)ncclShmem.groups[group].userOutput + outIx + offset;
ssize_t pOffset = index*peerOffset;
if (skip >= 0 && index >= skip) pOffset += peerElem;
// Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
@@ -436,6 +457,7 @@ private:
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->tail;
connStepCache = loadStepValue(connStepPtr);
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (conn->connFifo != nullptr) {
flags |= ConnFifoEnabled;
@@ -484,6 +506,7 @@ private:
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->head;
connStepCache = loadStepValue(connStepPtr);
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (connFifo == nullptr && Direct) {
// User buffers have been registered
@@ -528,24 +551,19 @@ private:
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
this->fan = Fan(nrecv, nsend);
constexpr int ThreadPerSync = 8;
constexpr int ThreadPerSync =
MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
MaxSend >= 8 || MaxRecv >= 8 ? 16 :
8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
int g = tid / ThreadPerSync;
int ng = nthreads / ThreadPerSync;
index = tid % ThreadPerSync;
index = -1;
flags = 0;
if (g == 0) {
if (index < nrecv) flags |= RoleWaitRecv;
if (index == nrecv) flags |= RoleInput;
} else if (g == 1) {
if (index < nsend) flags |= RoleWaitSend;
if (index == nsend) flags |= RoleOutput;
} else if (g == ng - 2) {
if (index < nrecv) flags |= RolePostRecv;
} else if (g == ng - 1) {
if (index < nsend) flags |= RolePostSend;
}
assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; }
else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; }
else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); }
else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
int peer = 0;
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
@@ -558,15 +576,11 @@ private:
if (barrierAny(flags & NetDeviceUnpack)) {
flags |= AnyNetDeviceUnpack;
// g == 0 is the first ThreadPerSync # of threads of this warp
// g == 0 is also the RoleWaitRecv threads of this group, thus the thread ID will correlate to the peer index
if (g == 0) {
uint32_t mask = __ballot_sync((1U << ThreadPerSync) - 1, (flags & NetDeviceUnpack) ? 1 : 0);
// We only want to update the shared memory variable with a single thread
if (tid == 0) {
ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
}
// RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers
// have NetDeviceUnpack.
uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0);
if (tid == 0) {
ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
}
}
@@ -588,7 +602,8 @@ private:
// was accessed directly.
uint64_t prevStep = step - StepPerSlice;
volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size);
while (*ptr != -1);
int spins = 0;
while (*ptr != -1) if (checkAbort(spins)) break;
}
if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
@@ -601,11 +616,11 @@ private:
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
if (flags & RoleInput) {
userBuff = (T*)inputBuf;
if (tid==0) {
ncclShmem.groups[group].userInput = (void*)inputBuf;
ncclShmem.groups[group].userOutput = (void*)outputBuf;
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
}
if (flags & RoleOutput) userBuff = (T*)outputBuf;
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
@@ -696,8 +711,10 @@ private:
}
__device__ void moveDataPtrs(intptr_t delta) {
if (flags & (RoleInput|RoleOutput))
userBuff += delta;
if (tid==0) {
ncclShmem.groups[group].userInput = (T*)ncclShmem.groups[group].userInput + delta;
ncclShmem.groups[group].userOutput = (T*)ncclShmem.groups[group].userOutput + delta;
}
}
__device__ __forceinline__ void send(intptr_t inpIx, int eltN) {
+38 -22
Parādīt failu
@@ -262,16 +262,24 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
tn = nWarps2*WARP_SIZE;
if (tid < tn) {
// Phase 2: Reduce from peers + local input -> send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
prims(tid, tn, direct->heads+1, &direct->out, nullptr, nullptr,
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
Scatterer</*ReduceSendNotRecv=*/false> scat;
scat.args = args;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.process</*Recv=*/1, /*Send=*/1>(scat);
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == 0) {
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
}
__syncwarp();
} else {
// Phase 2: Reduce from peers + local input -> send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
Scatterer</*ReduceSendNotRecv=*/false> scat;
scat.args = args;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.process</*Recv=*/1, /*Send=*/1>(scat);
}
}
return;
}
@@ -279,18 +287,26 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
tn = nWarps3*WARP_SIZE;
if (tid < tn) {
// Phase 3: recv from network
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
ssize_t railAllBeg = railGridOffset + args->bid*chunkSize;
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
ssize_t railOneBeg = ncclShmem.comm.node*sizePerRank;
ssize_t railOneEnd = railOneBeg + sizePerRank;
ssize_t beg = max(railAllBeg, railOneBeg);
ssize_t end = min(railAllEnd, railOneEnd);
prims.recv(beg-railOneBeg, max(ssize_t(0), end-beg), /*postOp=*/true);
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == 0) {
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
}
__syncwarp();
} else {
// Phase 3: recv from network
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
ssize_t railOneEnd = railOneBeg + sizePerRank;
ssize_t beg = max(railAllBeg, railOneBeg);
ssize_t end = min(railAllEnd, railOneEnd);
prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true);
}
}
return;
}
+61 -10
Parādīt failu
@@ -680,6 +680,36 @@ static ncclResult_t registerIntraNodeBuffers(
}
}
info->regBufType = NCCL_IPC_REG_BUFFER;
} else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opFull.op != ncclDevPreMulSum && info->opFull.op != ncclDevSumPostDiv) {
int sendRegBufFlag = 0;
int recvRegBufFlag = 0;
void *sendHandle, *recvHandle;
if (ncclParamLocalRegister()) {
ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, info->sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle);
info->sendMhandle = sendHandle;
if (sendRegBufFlag) {
ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, info->recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle);
info->recvMhandle = recvHandle;
}
}
if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && plan->persistent && ncclParamGraphRegister()) {
ncclCollnetGraphRegisterBuffer(comm, plan, info->sendbuff, info->sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle);
info->sendMhandle = sendHandle;
if (sendRegBufFlag) {
ncclCollnetGraphRegisterBuffer(comm, plan, info->recvbuff, info->recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle);
info->recvMhandle = recvHandle;
}
}
if (sendRegBufFlag && recvRegBufFlag) {
info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1));
info->regBufType = NCCL_COLLNET_REG_BUFFER;
if (sendRegBufFlag == 1 && recvRegBufFlag == 1) {
INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, info->sendbuffSize, info->recvbuff, recvHandle, info->recvbuffSize);
}
}
}
fallback:
#endif
@@ -806,7 +836,7 @@ static ncclResult_t scheduleCollTasksToPlan(
while (!ncclIntruQueueEmpty(&tasks->collCBDQueue)) {
// Get nChannels and peek whether the budget allows before we enqueue
collInfo = ncclIntruQueueHead(&tasks->collCBDQueue);
collInfo->nChannels = DIVUP(collInfo->aggnBytes * tasks->usableChannels, totalCBDBytes);
collInfo->nChannels = DIVUP(collInfo->workBytes * tasks->usableChannels, totalCBDBytes);
// Haven't got nChannels info yet, relax the budget boundary a bit.
if (*nWorkBudget < collInfo->nChannels) return ncclSuccess;
@@ -1173,6 +1203,12 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size);
ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, obj);
}
while (!ncclIntruQueueEmpty(&plan->collnetHandleQueue)) {
struct ncclCollnetHandleList* obj = ncclIntruQueueDequeue(&plan->collnetHandleQueue);
NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyconn, obj->collnetHandle));
INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->collnetHandle, obj->size, obj->buffer);
ncclMemoryPoolFree(&comm->memPool_ncclCollnetHandleList, obj);
}
}
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
return ncclSuccess;
@@ -1512,7 +1548,7 @@ static ncclResult_t getTunerInfo(struct ncclInfo* collInfo, int collNetSupport,
collInfo->nChannels = 0;
if (collInfo->comm->tuner != NULL) {
NCCLCHECK(collInfo->comm->tuner->getCollInfo(
collInfo->coll, collInfo->nBytes,
collInfo->comm->tunerContext, collInfo->coll, collInfo->nBytes,
collNetSupport, nvlsSupport, numPipeOps,
&collInfo->algorithm, &collInfo->protocol, &collInfo->nChannels));
}
@@ -1649,7 +1685,7 @@ static ncclResult_t setCollWorkElem(uint64_t workCount, uint64_t workOffset, siz
static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkElem* work, struct ncclChannel* channel, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[], struct ncclWorkElemReg* workElemReg) {
if (regBufType == NCCL_IPC_REG_BUFFER) {
workElemReg->elem = *work;
workElemReg->elem.regUsed = 1;
workElemReg->elem.regUsed = NCCL_IPC_REG_BUFFER;
for (int i = 0; i < NCCL_MAX_DIRECT_ARITY; i++) {
int peer = channel->collnetDirect.down[i];
if (peer == -1) break;
@@ -1666,10 +1702,13 @@ static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkEl
}
} else if (regBufType == NCCL_NVLS_REG_BUFFER) {
workElemReg->elem = *work;
workElemReg->elem.regUsed = 1;
workElemReg->elem.regUsed = NCCL_NVLS_REG_BUFFER;
/* NVLS only has one send and recv buffer registered */
workElemReg->dnInputs[0] = regBufSend[0];
workElemReg->dnOutputs[0] = regBufRecv[0];
} else if (regBufType == NCCL_COLLNET_REG_BUFFER) {
workElemReg->elem = *work;
workElemReg->elem.regUsed = NCCL_COLLNET_REG_BUFFER;
} else {
/* impossible value */
WARN("Invalid regBufType %d\n", regBufType);
@@ -1678,7 +1717,7 @@ static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkEl
return ncclSuccess;
}
NCCL_PARAM(NvlsTreeChunkSize, "NVLSTREE_MAX_CHUNKSIZE", -2);
NCCL_PARAM(NvlsTreeMaxChunkSize, "NVLSTREE_MAX_CHUNKSIZE", -2);
static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nBytes, int nChannels) {
int stepSize = collInfo->comm->buffSizes[collInfo->protocol] / NCCL_STEPS;
@@ -1701,7 +1740,7 @@ static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nByte
while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2;
while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
} else if (collInfo->algorithm == NCCL_ALGO_NVLS) {
int maxChunkSize = 131072;
int maxChunkSize = collInfo->comm->nvlsChunkSize;
if (collInfo->comm->nNodes > 1 && collInfo->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
@@ -1712,7 +1751,8 @@ static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nByte
} else if (collInfo->algorithm == NCCL_ALGO_NVLS_TREE) {
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = nChannels * collInfo->comm->channels[0].nvls.nHeads;
int maxChunkSize = ncclParamNvlsTreeChunkSize();
chunkSize = collInfo->comm->nvlsChunkSize;
int maxChunkSize = (int)ncclParamNvlsTreeMaxChunkSize();
if (maxChunkSize == -2) maxChunkSize = collInfo->comm->nNodes >= 4 ? 65536 : chunkSize;
chunkSize = std::min(chunkSize, maxChunkSize);
if ((nBytes < (32 * (concurrentOps * chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
@@ -1747,11 +1787,22 @@ static ncclResult_t initCollProxyOp(struct ncclInfo* collInfo, int channelId, ui
proxyOp->pattern = collInfo->pattern;
proxyOp->coll = collInfo->coll;
proxyOp->root = collInfo->root;
proxyOp->reg = 0;
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
// because some protocols need to transmit more than the total size, plus they sometimes
// round up
proxyOp->nbytes = collInfo->stepSize * proxyOp->sliceSteps;
if (collInfo->regBufType == NCCL_COLLNET_REG_BUFFER) {
proxyOp->reg = 1;
proxyOp->nsteps = DIVUP(collInfo->nBytes, NCCL_MAX_COLLNET_SIZE);
proxyOp->sendMhandle = collInfo->sendMhandle;
proxyOp->recvMhandle = collInfo->recvMhandle;
proxyOp->sendbuff = (uint8_t*)collInfo->sendbuff;
proxyOp->recvbuff = (uint8_t*)collInfo->recvbuff;
proxyOp->nbytes = collInfo->nBytes;
} else {
proxyOp->reg = 0;
}
proxyOp->channelId = channelId;
proxyOp->opCount = opCount;
@@ -1958,7 +2009,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
ncclResult_t ret = ncclSuccess;
int devOld = -1;
NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, fail);
NCCLCHECKGOTO(CommCheck(info->comm, info->opName, "comm"), ret, fail);
// Check whether communicator is ready to communicate
NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail);
@@ -1990,7 +2041,7 @@ fail:
NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) {
NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm"));
NCCLCHECK(CommCheck(comm, "ncclRedOpCreatePreMulSum", "comm"));
/* join init thread before creating PreMulSum op. */
NCCLCHECK(ncclCommEnsureReady(comm));
+18 -8
Parādīt failu
@@ -17,6 +17,7 @@
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->topo->nodes[GPU].count;
int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks;
int nChannels = comm->nChannels;
topoRanks->nvlsHeadNum = 0;
@@ -71,7 +72,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
// Get nvls heads and the number of heads. Duplicate head is not allowed.
for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
bool addHead = true;
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks;
for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
@@ -257,8 +258,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
channel->nvls.nNodes = comm->nNodes;
if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
}
// MNNVL: NVLS not yet supported
if (comm->nNodes == 1 || comm->MNNVL) return ncclSuccess;
if (comm->nNodes == 1) return ncclSuccess;
// Connect Trees
int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
@@ -310,9 +310,9 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
struct ncclNvls* nvls0 = &comm->channels[0].nvls;
struct ncclNvls* nvls1 = &comm->channels[1].nvls;
INFO(NCCL_GRAPH, "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
nvls0->treeDown[0], nvls0->treeDown[1], comm->rank, nvls0->treeUp,
nvls1->treeDown[0], nvls1->treeDown[1], comm->rank, nvls1->treeUp);
INFO(NCCL_GRAPH, "NVLS Trees : %d/%d/%d->%d->%d %d/%d/%d->%d->%d",
nvls0->treeDown[0], nvls0->treeDown[1], nvls0->treeDown[2], comm->rank, nvls0->treeUp,
nvls1->treeDown[0], nvls1->treeDown[1], nvls1->treeDown[2], comm->rank, nvls1->treeUp);
return ncclSuccess;
}
@@ -363,13 +363,14 @@ void exchangeValues(int* v0, int* v1) {
*v0 = tmp;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs) {
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
int nranks = comm->nRanks;
int nNodes = comm->nNodes;
int nChannels = comm->nChannels;
int minHeadNum = INT_MAX;
int shared = parent && parent->nvlsSupport && parent->config.splitShare;
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
@@ -380,7 +381,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
// Alternate rings to avoid crossing rails
if (graphs[NCCL_ALGO_RING]->crossNic && (comm->nNodes % 2) == 0 && (nChannels % 2) == 0) {
if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
for (int r=0; r<comm->nRanks; r++) {
if (comm->rankToNode[r] % 2 == 1) {
// Exchange rings
@@ -469,11 +470,20 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
}
comm->collChannels = comm->nChannels;
#if CUDART_VERSION >= 12010
// Support maximal channel usage for aggregation
if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
comm->nvlsChannels = parent->nvlsResources->nChannels;
}
if (comm->nChannels < comm->nvlsChannels) {
nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
}
NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
#endif
if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
comm->collChannels = std::min(comm->collChannels, comm->nChannels);
}
// Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
+32 -32
Parādīt failu
@@ -58,6 +58,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
struct ncclTopoNode* remNode = link->remNode;
if (remNode->paths[baseNode->type] == NULL) {
NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
for (int i=0; i<system->nodes[baseNode->type].count; i++) remNode->paths[baseNode->type][i].type = PATH_DIS;
}
struct ncclTopoLinkList* remPath;
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
@@ -110,11 +111,12 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
}
static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
char line[1024];
const int linesize = 1024;
char line[linesize];
#ifdef ENABLE_TRACE
INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
#else
sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
snprintf(line, linesize, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
int offset = strlen(line);
#endif
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
@@ -126,12 +128,12 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
for (int i=0; i<node->paths[t][n].count; i++) {
struct ncclTopoLink* link = node->paths[t][n].list[i];
struct ncclTopoNode* remNode = link->remNode;
sprintf(line+offset, "--%s(%g)->%s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], remNode->id);
snprintf(line+offset, linesize-offset, "--%s(%g)->%s/%lx-%lx", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], NCCL_TOPO_ID_SYSTEM_ID(remNode->id), NCCL_TOPO_ID_LOCAL_ID(remNode->id));
offset = strlen(line);
}
INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw);
#else
sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]);
snprintf(line+offset, linesize-offset, "%s/%lx-%lx (%d/%.1f/%s) ", topoNodeTypeStr[t], NCCL_TOPO_ID_SYSTEM_ID(system->nodes[t].nodes[n].id), NCCL_TOPO_ID_LOCAL_ID(system->nodes[t].nodes[n].id), node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]);
offset = strlen(line);
#endif
}
@@ -361,12 +363,12 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
int ncclTopoUserGdrLevel = -1;
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int64_t netId, int read, int* useGdr) {
*useGdr = 0;
// Get GPU and NET
int n, g;
NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
@@ -403,18 +405,18 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
if (distance == PATH_PXN) {
// In case of PXN, use the intermediate GPU distance instead
int proxyRank, g;
NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank));
NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
distance = proxyGpu->paths[NET][n].type;
}
if (distance > netGdrLevel) {
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %lx (distance %d > %d)", busId, netId, distance, netGdrLevel);
return ncclSuccess;
}
*useGdr = 1;
INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %lx (distance %d <= %d), read %d", busId, netId, distance, netGdrLevel, read);
return ncclSuccess;
}
@@ -465,10 +467,10 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_
return ncclSuccess;
}
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank) {
// Get GPU and NET
int n, g;
NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
struct ncclTopoLinkList* path = gpu->paths[NET]+n;
@@ -480,7 +482,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
type = node->type;
}
if (type != GPU) {
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
WARN("Could not find intermediate GPU between GPU rank %d and NIC %lx", rank, netId);
return ncclInternalError;
}
*intermediateRank = node->gpu.rank;
@@ -516,11 +518,12 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
int nr = 0;
int* ranks = NULL;
for (int rank=0; rank<comm->nRanks; rank++) {
int netDev, proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
int64_t netId;
int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank));
if (proxyRank == comm->rank) continue;
int useGdr;
NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netId, 1, &useGdr));
if (useGdr == 0) continue;
int found = 0;
for (int r=0; r<nr; r++) {
@@ -603,13 +606,14 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
if (ncclPxnDisable(comm) != 1) {
int localGpuIndex;
NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
NCCLCHECK(ncclTopoGetLocalGpu(system, netNode->id, &localGpuIndex));
if (localGpuIndex != g && localGpuIndex != -1) {
// PXN = PCI + NVLink.
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
// Only use PXN for NIC n if remote GPU p ...
if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && // Is on the same node as us
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
// We can use that GPU as relay to communicate with that NIC.
@@ -618,15 +622,17 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
}
}
// Update path when we don't want to / can't use GPU Direct RDMA.
int gdr;
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
if (gdr == 0) {
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
if (gpu->paths[NET][n].type < PATH_PHB) {
// Update path when we don't want to / can't use GPU Direct RDMA.
int gdr;
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
if (gdr == 0) {
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
}
}
}
@@ -669,8 +675,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
}
// MNNVL: Remove network nodes as they are connected via NVLink
if (system->nodes[GPU].count == comm->nRanks || comm->MNNVL) {
if (system->nodes[GPU].count == comm->nRanks) {
for (int n=system->nodes[NET].count-1; n>=0; n--)
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
}
@@ -704,11 +709,6 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
} else {
*nChannels = 2;
}
} else if (comm->MNNVL) {
// MNNVL assume all GPUs are connected via NVLink
path = system->nodes[GPU].nodes[g].paths[GPU]+((g+1)%system->nodes[GPU].count);
float nvlBw = ncclTopoNVLinkBw(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
*nChannels = 2*std::max(1, (int)(path->bw / nvlBw));
} else {
// Remote rank, use network
int nNetChannels = ncclParamNChannelsPerNetPeer();
+63 -48
Parādīt failu
@@ -4,6 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "comm.h"
#include "core.h"
#include "graph.h"
#include "topo.h"
@@ -39,6 +40,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
int inter = system->nodes[NET].count;
if (inter == 0 && system->nodes[GPU].count == 1) {
system->maxBw = LOC_BW;
system->totalBw = LOC_BW;
return ncclSuccess;
}
for (int g=0; g<system->nodes[GPU].count; g++) {
@@ -115,7 +117,6 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc
WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
return ncclInternalError;
}
if (path->count == 0 ) return ncclSuccess;
// Now check link type
*node = NULL;
@@ -217,7 +218,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
}
static ncclResult_t getNetPaths(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoLinkList** netPaths) {
int netId = graph->inter[graph->nChannels*2];
int64_t netId = graph->inter[graph->nChannels*2];
int n;
NCCLCHECK(getNetIndex(system, netId, &n));
*netPaths=system->nodes[NET].nodes[n].paths[GPU];
@@ -261,6 +262,8 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
for (int i=0; i<count; i++) next[i] = scores[i].g;
}
*countPtr = count;
if (system->nodes[NVS].count) {
// NVSwitches prefer when we talk to a limited set of peers. Try to use neighbors first.
int index = gpu-system->nodes[GPU].nodes;
@@ -277,16 +280,18 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
} else {
firstGpus[0] = nextGpu; firstGpuCount = 1;
}
if (nextGpu == prevGpu && firstGpuCount == 2) firstGpuCount = 1;
int firstGpuRealCount = 0;
for (int g=0; g<firstGpuCount; g++) {
for (i=0; i<count && next[i] != firstGpus[g]; i++);
if (i<count) {
for (; i>0; i--) next[i] = next[i-1];
next[0] = firstGpus[g];
firstGpuRealCount++;
}
}
*countPtr = firstGpuRealCount;
}
*countPtr = count;
return ncclSuccess;
}
@@ -372,7 +377,6 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
return ncclSuccess;
}
// 2. Try to get better bandwidth
// Give a 5% perf bonus to paths not crossing nics
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
*copy = 1;
return ncclSuccess;
@@ -405,8 +409,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
localNetCount = 0;
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
for (int c = 0; c<MAXCHANNELS; c++) {
int netId;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
int64_t netId;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
localNetCount++;
@@ -427,7 +431,7 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
localNetCount = 0;
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
struct ncclTopoLinkList* paths = gpu->paths[NET];
for (int n=0; n<system->nodes[NET].count; n++) {
for (int n=0; n<system->nodes[NET].count && n<MAXCHANNELS; n++) {
if (paths[n].type == t) localNets[localNetCount++] = n;
}
// Append NICs to list
@@ -702,22 +706,25 @@ struct kvDict kvDictLinkType[] = {
ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int* inter = graph->inter+2*c;
int64_t* inter = graph->inter+2*c;
int* intra = graph->intra+ngpus*c;
int n=0, g=0;
for (int s=0; s<xmlChannel->nSubs; s++) {
struct ncclXmlNode* sub = xmlChannel->subs[s];
int dev;
NCCLCHECK(xmlGetAttrInt(sub, "dev", &dev));
int64_t dev;
const char* str;
NCCLCHECK(xmlGetAttrStr(sub, "dev", &str));
dev = strtol(str, NULL, 16);
if (strcmp(sub->name, "net") == 0) {
inter[n++] = dev;
} else if (strcmp(sub->name, "gpu") == 0) {
int rank = -1;
for (int g=0; g<ngpus; g++) {
if (system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank;
int systemId = NCCL_TOPO_ID_SYSTEM_ID(system->nodes[GPU].nodes[g].id);
if (NCCL_TOPO_ID(systemId, system->nodes[GPU].nodes[g].gpu.dev) == dev) rank = system->nodes[GPU].nodes[g].gpu.rank;
}
if (rank == -1) {
WARN("XML Import Channel : dev %d not found.", dev);
WARN("XML Import Channel : dev %ld not found.", dev);
return ncclSystemError;
}
intra[g++] = rank;
@@ -763,29 +770,33 @@ ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclT
// Export channel `c` of `graph` as a <channel> XML node created under `parent`.
//
// The emitted children are, in order:
//   - a <net dev=...> node holding inter[0], only when the system has NET nodes;
//   - one <gpu dev=...> node per rank of the channel's intra list (for graph id 3
//     only the first GPU is written);
//   - a <net dev=...> node holding inter[1], only when the system has NET nodes.
//
// Network ids and GPU ids are 64-bit, so they are written with xmlSetAttrLong.
// A GPU's "dev" value is NCCL_TOPO_ID(systemId, gpu.dev), with systemId taken
// from the GPU node id.
//
// Returns ncclSuccess, ncclInternalError when a rank of the channel matches no GPU
// node of `system`, or the first error propagated through NCCLCHECK.
ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) {
  struct ncclXmlNode* xmlChannel;
  int ngpus = system->nodes[GPU].count;
  // Each channel owns 2 entries of graph->inter and ngpus entries of graph->intra.
  int64_t* inter = graph->inter+2*c;
  int* intra = graph->intra+ngpus*c;
  NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel));
  struct ncclXmlNode* node;
  if (system->nodes[NET].count) {
    NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
    NCCLCHECK(xmlSetAttrLong(node, "dev", inter[0]));
  }
  for (int g=0; g<ngpus; g++) {
    NCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node));
    // Translate the rank stored in the graph into a (systemId, dev) topology id.
    int64_t dev = -1;
    for (int i=0; i<ngpus; i++) {
      if (system->nodes[GPU].nodes[i].gpu.rank == intra[g]) {
        int systemId = NCCL_TOPO_ID_SYSTEM_ID(system->nodes[GPU].nodes[i].id);
        dev = NCCL_TOPO_ID(systemId, system->nodes[GPU].nodes[i].gpu.dev);
      }
    }
    if (dev == -1) {
      WARN("XML Export Channel : rank %d not found.", intra[g]);
      return ncclInternalError;
    }
    NCCLCHECK(xmlSetAttrLong(node, "dev", dev));
    if (graph->id == 3) break; // NVLS graphs only use the first GPU
  }
  if (system->nodes[NET].count) {
    NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
    NCCLCHECK(xmlSetAttrLong(node, "dev", inter[1]));
  }
  return ncclSuccess;
}
@@ -829,7 +840,7 @@ ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngp
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int64_t));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
@@ -841,7 +852,7 @@ float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 11.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
@@ -868,7 +879,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
if (str) {
INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str);
struct ncclXml* xml;
NCCLCHECK(ncclCalloc(&xml, 1));
NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml));
int nChannels;
NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
@@ -907,7 +918,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
int speedIndex = 0;
float maxBw = system->maxBw;
float totalBw = system->totalBw;
if (ngpus == 1 || graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1);
if (ngpus > 1 && graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1);
while ((speedArray[speedIndex] > maxBw || speedArray[speedIndex]*graph->minChannels > totalBw) && speedIndex < nspeeds-1) speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
@@ -926,7 +937,7 @@ search:
for (int g=0; g<ngpus; g++) {
printf("%d ", graph->intra[c*ngpus+g]);
}
printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
printf("[%lx %lx]", graph->inter[c*2+0], graph->inter[c*2+1]);
printf("\n");
}
#endif
@@ -1041,7 +1052,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
sprintf(line, "%2d :", c);
int offset = strlen(line);
if (system->nodes[NET].count > 0) {
sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]);
sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c]);
offset = strlen(line);
}
for (int i=0; i<ngpus; i++) {
@@ -1049,7 +1060,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
offset = strlen(line);
}
if (system->nodes[NET].count > 0) {
sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]);
sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c+1]);
offset = strlen(line);
}
INFO(NCCL_GRAPH, "%s", line);
@@ -1062,7 +1073,7 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
if (str) {
INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
struct ncclXml* xml;
NCCLCHECK(ncclCalloc(&xml, 1));
NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
NCCLCHECK(ncclTopoDumpXmlToFile(str, xml));
free(xml);
@@ -1072,11 +1083,11 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
#include "comm.h"
// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int* dev) {
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int64_t* netId) {
ncclResult_t ret = ncclSuccess;
int localRanks = comm->topo->nodes[GPU].count;
int netNum = 0;
int net[MAXCHANNELS];
int64_t net[MAXCHANNELS];
for (int c = 0; c < graph->nChannels; c++) {
if (graph->intra[c * localRanks] == comm->rank) {
@@ -1084,7 +1095,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i
}
}
if (netNum) {
*dev = net[channelId % netNum];
*netId = net[channelId % netNum];
} else {
ret = ncclInternalError;
goto fail;
@@ -1100,23 +1111,30 @@ fail:
// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank) {
int64_t netId = -1;
int netDev = -1;
if (graph) {
// Honor the net device in the graph
int channel = channelId%graph->nChannels;
int ngpus = comm->topo->nodes[GPU].count;
int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
*dev = graph->inter[channel*2+index];
netId = graph->inter[channel*2+index];
} else {
NCCLCHECK(getNvlsNetDev(comm, graph, channelId, dev));
NCCLCHECK(getNvlsNetDev(comm, graph, channelId, &netId));
}
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
NCCLCHECK(ncclTopoIdToNetDev(comm->topo, netId, &netDev));
if (dev) *dev = netDev;
if (id) *id = netId;
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, netId, proxyRank));
} else if (peerRank == -1) {
return ncclInternalError;
} else {
// Start with our local NIC and local Rank
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, dev));
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, &netId, &netDev));
if (dev) *dev = netDev;
if (id) *id = netId;
*proxyRank = rank;
int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
@@ -1126,38 +1144,35 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
int nvmlDev = comm->peerInfo[peerRank].nvmlDev;
int localRank;
if (ncclTopoDevToRank(comm->topo, nvmlDev, &localRank) != ncclSuccess) return ncclSuccess;
int netDev;
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev));
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netId, &netDev));
int n;
// Check that device exists on our node
if (ncclParamCrossNic() == 0) {
if (ncclTopoIdToIndex(comm->topo, NET, netDev, &n) != ncclSuccess) {
WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank);
return ncclInvalidUsage;
}
*dev = netDev;
if (dev) *dev = netDev;
if (id) *id = netId;
}
if (pxnLevel == 1) {
int g, n;
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g));
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n));
struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g;
if (gpu->paths[NET][n].type <= PATH_PXN) {
*dev = netDev;
if (dev) *dev = netDev;
if (id) *id = netId;
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
}
} else if (pxnLevel == 2) {
// Check which local GPU corresponds to that NIC and see if we can use PXN.
int n, g1, g2;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n));
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netDev, &g2));
NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netId, &g2));
if (g2 != -1) {
struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
*proxyRank = peerGpu->gpu.rank;
*dev = netDev;
if (dev) *dev = netDev;
if (id) *id = netId;
return ncclSuccess;
}
}
+99 -41
Parādīt failu
@@ -15,13 +15,14 @@
#include <fcntl.h>
#include "xml.h"
#include "cpuset.h"
#include "bootstrap.h"
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" };
/******************************************************************/
/******************* Graph Creation Functions *********************/
@@ -156,9 +157,13 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int ind
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) {
// Aggregate links into higher bw for NVLink
struct ncclTopoLink* link;
for (link = node->links; link->remNode; link++) {
for (link = node->links; link - node->links != NCCL_TOPO_MAX_LINKS && link->remNode; link++) {
if (link->remNode == remNode && link->type == type) break;
}
if (link - node->links == NCCL_TOPO_MAX_LINKS) {
WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
return ncclInternalError;
}
if (link->remNode == NULL) node->nlinks++;
link->type = type;
link->remNode = remNode;
@@ -218,6 +223,10 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
struct ncclTopoNode* remNode = sub->links[l].remNode;
if (remNode == pciSwitch) continue;
// Add link from parent PCI switch -> PCI device
if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) {
WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
return ncclInternalError;
}
memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
pciSwitch->nlinks++;
// Update link from PCI device -> parent PCI switch
@@ -243,11 +252,13 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
// And connect all CPU nodes together
for (int n=0; n<system->nodes[CPU].count; n++) {
struct ncclTopoNode* cpu1 = system->nodes[CPU].nodes+n;
for (int p=0; p<system->nodes[CPU].count; p++) {
if (n == p) continue;
struct ncclTopoNode* cpu2 = system->nodes[CPU].nodes+p;
if (n == p || (NCCL_TOPO_ID_SYSTEM_ID(cpu1->id) != NCCL_TOPO_ID_SYSTEM_ID(cpu2->id))) continue;
float bw;
NCCLCHECK(ncclTopoGetInterCpuBw(system->nodes[CPU].nodes+n, &bw));
NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, bw));
NCCLCHECK(ncclTopoGetInterCpuBw(cpu1, &bw));
NCCLCHECK(ncclTopoConnectNodes(cpu1, cpu2, LINK_SYS, bw));
}
}
return ncclSuccess;
@@ -255,13 +266,13 @@ ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
if (node->type == GPU) {
sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
sprintf(line+offset, "%s/%lx-%lx (%d)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->gpu.rank);
} else if (node->type == CPU) {
sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
sprintf(line+offset, "%s/%lx-%lx (%d/%d/%d)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else if (node->type == PCI) {
sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
sprintf(line+offset, "%s/%lx-%lx (%lx)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->pci.device);
} else {
sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
sprintf(line+offset, "%s/%lx-%lx", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
}
INFO(NCCL_GRAPH, "%s", line);
for (int i=0; i<offset; i++) line[i] = ' ';
@@ -328,12 +339,13 @@ ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
return ncclSuccess;
}
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) {
int dev;
NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
struct ncclTopoNode* net;
NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev)));
net->net.dev = dev;
const char* str;
NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
if (str) sscanf(str, "0x%lx", &net->net.asic);
@@ -356,14 +368,14 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
return ncclSuccess;
}
// Add the <net> children of a <nic> XML node to the topology.
//   xmlNic   : XML node whose sub-nodes are scanned.
//   system   : topology being built; forwarded to ncclTopoAddNet.
//   nic      : topology NIC node; forwarded to ncclTopoAddNet.
//   systemId : system index combined into the net node id by ncclTopoAddNet.
// Returns ncclSuccess, or the first error propagated through NCCLCHECK.
ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) {
  for (int s=0; s<xmlNic->nSubs; s++) {
    struct ncclXmlNode* xmlNet = xmlNic->subs[s];
    // Children with any other tag name are ignored.
    if (strcmp(xmlNet->name, "net") != 0) continue;
    // Skip nets for which xmlGetAttrIndex reports index -1 for "dev"
    // (presumably: the attribute is absent).
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
    if (index == -1) continue;
    NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic, systemId));
  }
  return ncclSuccess;
}
@@ -382,7 +394,7 @@ struct kvDict kvDictPciGen[] = {
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
{ "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
{ NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId) {
const char* str;
int type;
@@ -401,7 +413,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
int index;
NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
if (index == -1) return ncclSuccess;
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
NCCLCHECK(ncclTopoCreateNode(system, &node, type, NCCL_TOPO_ID(systemId, busId)));
NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
}
struct ncclXmlNode* xmlNic = NULL;
@@ -411,14 +423,15 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
// Ignore sub device ID and merge multi-port NICs into one PCI device.
busId &= 0xfffffffffffffff0;
struct ncclTopoNode* nicNode = NULL;
NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
int64_t id = NCCL_TOPO_ID(systemId, busId);
NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id));
if (nicNode == NULL) {
NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id));
node = nicNode; // Connect it to parent later on
}
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode, systemId));
} else if (type == PCI) {
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
NCCLCHECK(ncclTopoCreateNode(system, &node, type, NCCL_TOPO_ID(systemId, busId)));
NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 48;
NCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
@@ -430,7 +443,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
for (int s=0; s<xmlPci->nSubs; s++) {
struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
}
}
@@ -452,11 +465,25 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
// Resolve the system index of the host described by a <cpu> XML node.
// The node's "host_hash" attribute is parsed as hexadecimal (hash 0 is used when
// xmlGetAttr yields a NULL string) and looked up in system->hostHashes. An unknown
// hash is appended to the table and system->nHosts grows by one.
// The matching (or newly assigned) index is stored in *systemIdPtr.
ncclResult_t ncclGetSystemId(struct ncclTopoSystem* system, struct ncclXmlNode* xmlCpu, int* systemIdPtr) {
  const char* hashAttr;
  NCCLCHECK(xmlGetAttr(xmlCpu, "host_hash", &hashAttr));
  uint64_t hash = 0;
  if (hashAttr) hash = strtoull(hashAttr, NULL, 16);

  // Linear search over the hosts recorded so far.
  int idx = 0;
  while (idx < system->nHosts && system->hostHashes[idx] != hash) idx++;

  // No match: idx equals nHosts, so the hash takes the next free slot.
  if (idx == system->nHosts) {
    system->hostHashes[idx] = hash;
    system->nHosts++;
  }
  *systemIdPtr = idx;
  return ncclSuccess;
}
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
int numaId;
NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
int systemId;
NCCLCHECK(ncclGetSystemId(system, xmlCpu, &systemId));
struct ncclTopoNode* cpu;
NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, NCCL_TOPO_ID(systemId, numaId)));
const char* str;
NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
if (str != NULL) {
@@ -482,26 +509,27 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
}
for (int s=0; s<xmlCpu->nSubs; s++) {
struct ncclXmlNode* node = xmlCpu->subs[s];
if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId));
if (strcmp(node->name, "nic") == 0) {
struct ncclTopoNode* nic = NULL;
NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
if (nic == NULL) {
NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, NCCL_TOPO_ID(systemId, 0)));
NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW));
NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW));
}
NCCLCHECK(ncclTopoAddNic(node, system, nic));
NCCLCHECK(ncclTopoAddNic(node, system, nic, systemId));
}
}
return ncclSuccess;
}
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
if (strcmp(node->name, "nvlink") == 0) {
struct ncclTopoNode* gpu = NULL;
int64_t pBusId;
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
pBusId = NCCL_TOPO_ID(systemId, pBusId);
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
if (gpu == NULL) {
WARN("Add NVLink error : could not find GPU %lx", pBusId);
@@ -520,7 +548,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
NCCLCHECK(xmlGetAttrStr(node, "target", &target));
int64_t busId;
NCCLCHECK(busIdToInt64(target, &busId));
NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId)));
} else if (targetType == CPU) {
// NVL connection to the local CPU
NCCLCHECK(findLocalCpu(gpu, &remote));
@@ -539,20 +567,24 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
}
}
} else {
if (strcmp(node->name, "cpu") == 0) {
NCCLCHECK(ncclGetSystemId(system, node, &systemId));
}
const char* busId;
NCCLCHECK(xmlGetAttr(node, "busid", &busId));
for (int s=0; s<node->nSubs; s++) {
NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId, systemId));
}
}
return ncclSuccess;
}
ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
if (strcmp(node->name, "c2c") == 0) {
struct ncclTopoNode* gpu = NULL;
int64_t pBusId;
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
pBusId = NCCL_TOPO_ID(systemId, pBusId);
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
if (gpu == NULL) {
WARN("Add NVLink error : could not find GPU %lx", pBusId);
@@ -569,25 +601,31 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys
NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw));
NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
} else {
if (strcmp(node->name, "cpu") == 0) {
NCCLCHECK(ncclGetSystemId(system, node, &systemId));
}
const char* busId;
NCCLCHECK(xmlGetAttr(node, "busid", &busId));
for (int s=0; s<node->nSubs; s++) {
NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, busId ? busId : parentBusId));
NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, busId ? busId : parentBusId, systemId));
}
}
return ncclSuccess;
}
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash) {
NCCLCHECK(ncclCalloc(topoSystem, 1));
struct ncclTopoSystem* system = *topoSystem;
struct ncclXmlNode* topNode;
NCCLCHECK(xmlFindTag(xml, "system", &topNode));
for (int s=0; s<topNode->nSubs; s++) {
struct ncclXmlNode* node = topNode->subs[s];
if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
}
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL));
for (int systemId=0; systemId<system->nHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId;
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
@@ -633,7 +671,7 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
struct ncclXml* xml;
NCCLCHECK(ncclCalloc(&xml, 1));
NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
if (xmlTopoFile) {
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
@@ -707,13 +745,32 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
if (comm->MNNVL) {
// MNNVL clique support
char* mem;
NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank);
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* cliqueXml;
NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES));
for (int i = 0; i < comm->clique.size; i++) {
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml));
}
free(xml);
xml = cliqueXml;
}
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
}
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash));
free(xml);
return ncclSuccess;
}
@@ -761,7 +818,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
int gpu;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
int* localNets;
@@ -773,15 +830,16 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
net += channelId%(DIVUP(localNetCount,localGpuCount));
*id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
free(localNets);
free(localGpus);
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
int netIndex;
NCCLCHECK(ncclTopoIdToIndex(system, NET, net, &netIndex));
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex));
int* localGpus = NULL;
int localGpuCount;
NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
@@ -789,9 +847,9 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gp
for (int lg=0; lg<localGpuCount; lg++) {
int g = localGpus[lg];
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
int id;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
if (net == id) {
int64_t id;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL));
if (netId == id) {
*gpuIndex = g;
free(localGpus);
return ncclSuccess;
+26 -3
Parādīt failu
@@ -88,7 +88,7 @@ struct ncclTopoLink {
float bw;
struct ncclTopoNode* remNode;
};
#define NCCL_TOPO_MAX_LINKS 32
#define NCCL_TOPO_MAX_LINKS 128
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
struct ncclTopoLinkList {
@@ -103,6 +103,10 @@ struct ncclTopoLinkList {
#define NCCL_TOPO_UNDEF (-1)
// Topology node IDs are 64-bit values: the system (host) index lives in the
// top 8 bits and the per-system local ID in the low 56 bits.
// Every macro argument is parenthesized so that compound expressions such as
// `a | b` or `x ? y : z` expand with the intended operator precedence.
#define NCCL_TOPO_ID_SYSTEM_ID(id) ((id) >> 56)
#define NCCL_TOPO_ID_LOCAL_ID(id) ((id) & 0x00ffffffffffffff)
#define NCCL_TOPO_ID(systemid, localid) (((int64_t)(systemid) << 56) + (localid))
struct ncclTopoNode {
int type;
int64_t id;
@@ -115,6 +119,7 @@ struct ncclTopoNode {
int gdrSupport;
}gpu;
struct {
int dev; // Plugin dev number
uint64_t asic;
int port;
float bw;
@@ -147,6 +152,9 @@ struct ncclTopoNodeSet {
};
struct ncclTopoSystem {
int systemId;
uint64_t hostHashes[NCCL_TOPO_MAX_NODES];
int nHosts;
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
float maxBw;
float totalBw;
@@ -158,9 +166,11 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id)
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw);
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank);
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank);
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
#define NCCL_TOPO_XML_MAX_NODES 256
#define NCCL_GRAPH_XML_MAX_NODES 4096
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, uint64_t localHostHash);
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
@@ -191,6 +201,7 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) {
*rank = -1;
for (int i=0; i<system->nodes[GPU].count; i++) {
if (NCCL_TOPO_ID_SYSTEM_ID(system->nodes[GPU].nodes[i].id) != system->systemId) continue; // Only consider GPUs on our node
if (system->nodes[GPU].nodes[i].gpu.dev == dev) {
*rank = system->nodes[GPU].nodes[i].gpu.rank;
return ncclSuccess;
@@ -199,6 +210,18 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
return ncclInternalError;
}
// Translate a NET topology node ID into the device number stored in that
// node's net.dev field.
//   system : topology whose NET nodes are searched
//   id     : node ID to look for
//   netDev : receives the device number, or -1 when no NET node matches
// Returns ncclSuccess on a match; otherwise logs a warning and returns
// ncclInternalError.
static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id, int* netDev) {
  *netDev = -1;
  for (int n = 0; n < system->nodes[NET].count; n++) {
    struct ncclTopoNode* net = system->nodes[NET].nodes + n;
    if (net->id != id) continue;
    *netDev = net->net.dev;
    return ncclSuccess;
  }
  WARN("Could not find NET with id %lx\n", id);
  return ncclInternalError;
}
// Returns NVLink bw in GB/s
static float ncclTopoNVLinkBw(int cudaCompCap) {
return
+8 -11
Parādīt failu
@@ -66,7 +66,7 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
/* NVLS */ { 0, 0, 23 }, /* NVLSTree */ { 0, 0, 23 } },
/* NVLS */ { 0, 0, 25 }, /* NVLSTree */ { 0, 0, 25 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
@@ -86,7 +86,7 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
static const double llMaxBws[3][3] = {
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
/* Hopper-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}
/* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}
};
static const double perChMaxRingLL128Bws[3][3] = {
@@ -132,8 +132,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
// MNNVL support - treat as a single NVLink connected node
int nNodes = comm->MNNVL ? 1 : comm->nNodes;
int nNodes = comm->nNodes;
int nRanks = comm->nRanks;
if (nRanks <= 1) return ncclSuccess;
@@ -178,7 +177,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
float busBw = graphs[a]->nChannels * bw;
// Various model refinements
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
@@ -190,7 +189,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
busBw = ppn * bw;
// AllGather/ReduceScatter requires 1:1 GPU:NIC
int nicPerNode = comm->collNetHeadsUniqueNum;
int nicPerNode = comm->collNetHeadsNum;
if (coll == ncclFuncAllGather && comm->nNodes > 1) {
if (!comm->ncclCollNet || !comm->ncclCollNet->iallgather || ppn > nicPerNode) busBw = 0;
}
@@ -282,15 +281,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
}
// MNNVL: NVLS not yet supported
if (comm->nNodes == 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
// Disable CollNet if it is not supported
if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
// MNNVL: NVLS not yet supported
if (comm->nNodes > 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS] = 0;
if (nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
// If user has hard set NCCL_ALGO=COLLNET, ignore it
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
@@ -437,7 +434,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
int logSize = log2i(info->nBytes>>6);
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && (!info->comm->MNNVL && info->comm->nNodes > 1)
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
&& info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
}
+86 -3
Parādīt failu
@@ -172,8 +172,8 @@ struct xmlHandler {
ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head, struct xmlHandler handlers[], int nHandlers) {
if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
while (1) {
if (xml->maxIndex == MAX_NODES) {
WARN("Error : XML parser is limited to 1024 nodes");
if (xml->maxIndex == xml->maxNodes) {
WARN("Error : XML parser is limited to %d nodes", xml->maxNodes);
return ncclInternalError;
}
struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
@@ -198,7 +198,13 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
int found = 0;
for (int h=0; h<nHandlers; h++) {
if (strcmp(node->name, handlers[h].name) == 0) {
if (head) head->subs[head->nSubs++] = node;
if (head) {
if (head->nSubs == MAX_SUBS) {
WARN("Error : XML parser is limited to %d subnodes", MAX_SUBS);
return ncclInternalError;
}
head->subs[head->nSubs++] = node;
}
node->parent = head;
node->nSubs = 0;
xml->maxIndex++;
@@ -218,6 +224,23 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
/* XML Writer */
/**************/
// Rewrite the node-to-node pointers stored inside `xml` relative to `base`.
//   exp == 1 -- serialize:   pointers become offsets from `base`
//   exp == 0 -- deserialize: offsets become pointers again, rebased on `base`
// Only the first xml->maxIndex nodes are visited. Always returns ncclSuccess.
ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp) {
  for (int n = 0; n < xml->maxIndex; n++) {
    struct ncclXmlNode* cur = xml->nodes + n;

    if (cur->parent) {
      // The "parent" offset is biased by 1 so that a genuine NULL parent stays
      // distinguishable from a parent that is the first node (offset 0).
      uintptr_t parentBits = (uintptr_t)cur->parent;
      if (exp) {
        parentBits = parentBits - base + 1;
      } else {
        parentBits = base - 1 + parentBits;
      }
      cur->parent = (struct ncclXmlNode*)parentBits;
    }

    for (int s = 0; s < cur->nSubs; s++) {
      uintptr_t subBits = (uintptr_t)cur->subs[s];
      if (exp) {
        subBits = subBits - base;
      } else {
        subBits = base + subBits;
      }
      cur->subs[s] = (struct ncclXmlNode*)subBits;
    }
  }
  return ncclSuccess;
}
ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) {
for (int i=0; i<indent; i++) fprintf(file, " ");
fprintf(file, "<%s", node->name);
@@ -249,6 +272,60 @@ ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml)
return ncclSuccess;
}
// Merge the topology held in `src` into `dst`.
//
// When `dst` has no "system" tag yet, the tree rooted at src->nodes (the first
// node of `src`) is copied into `dst` unchanged. Otherwise each "cpu" node of
// `src` is copied under dst's "system" node, unless `dst` already contains a
// "cpu" node with the same "numaid" and "host_hash" attributes. A missing
// "host_hash" attribute is treated as "0".
//
// Returns ncclInternalError if a "cpu" node has no "numaid" attribute. Any
// error reported by the XML helpers (lookup failures, node or subnode capacity
// exhausted while copying) is propagated to the caller rather than dropped.
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) {
  struct ncclXmlNode* topNode;
  NCCLCHECK(xmlFindTag(dst, "system", &topNode));

  if (topNode == NULL) {
    // dst has no system node yet: take the whole src tree. The result must be
    // checked, otherwise a capacity overflow would silently truncate the copy.
    NCCLCHECK(xmlAddTree(dst, NULL, src->nodes));
    return ncclSuccess;
  }

  // Fuse the CPUs with the first XML
  struct ncclXmlNode* srcCpu;
  NCCLCHECK(xmlFindTag(src, "cpu", &srcCpu));
  while (srcCpu) {
    const char* srcNumaId = NULL;
    const char* srcHostHash = NULL;
    NCCLCHECK(xmlGetAttr(srcCpu, "numaid", &srcNumaId));
    if (srcNumaId == NULL) {
      WARN("TopoFuseXmls : could not find CPU numa ID.");
      return ncclInternalError;
    }
    NCCLCHECK(xmlGetAttr(srcCpu, "host_hash", &srcHostHash));
    if (srcHostHash == NULL) {
      srcHostHash = "0";
    }

    // Search through the destination for a duplicate. Note that
    // this makes the complexity of this whole function O(n^2), but n
    // is expected to be small.
    struct ncclXmlNode* dstCpu;
    NCCLCHECK(xmlFindTag(dst, "cpu", &dstCpu));
    while (dstCpu) {
      const char* dstNumaId = NULL;
      const char* dstHostHash = NULL;
      NCCLCHECK(xmlGetAttr(dstCpu, "numaid", &dstNumaId));
      if (dstNumaId == NULL) {
        WARN("TopoFuseXmls : could not find CPU numa ID.");
        return ncclInternalError;
      }
      NCCLCHECK(xmlGetAttr(dstCpu, "host_hash", &dstHostHash));
      if (dstHostHash == NULL) {
        dstHostHash = "0";
      }
      if (strcmp(srcNumaId, dstNumaId) == 0 && strcmp(srcHostHash, dstHostHash) == 0) {
        break;
      }
      NCCLCHECK(xmlFindNextTag(dst, "cpu", dstCpu, &dstCpu));
    }

    // Only add the CPU if no duplicate was found
    if (dstCpu == NULL) {
      NCCLCHECK(xmlAddTree(dst, topNode, srcCpu));
    }

    NCCLCHECK(xmlFindNextTag(src, "cpu", srcCpu, &srcCpu));
  }
  return ncclSuccess;
}
/****************************************/
/* Parser rules for our specific format */
/****************************************/
@@ -556,6 +633,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr));
if (parent == NULL) {
NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
NCCLCHECK(xmlSetAttrLong(parent, "host_hash", getHostHash()));
NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr));
}
} else if (slashCount == 2) {
@@ -581,6 +659,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
struct ncclXmlNode* topNode;
NCCLCHECK(xmlFindTag(xml, "system", &topNode));
NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
NCCLCHECK(xmlSetAttrLong(parent, "host_hash", getHostHash()));
NCCLCHECK(xmlSetAttr(parent, "numaid", "-1"));
NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml));
}
@@ -595,6 +674,10 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
NCCLCHECK(xmlGetAttr(parent->subs[s], "busid", &busId));
if (busId != NULL && strcmp(newBusId, busId) < 0) { subIndex = s; break; }
}
if (parent->nSubs == MAX_SUBS) {
WARN("Error : XML parser is limited to %d subnodes", MAX_SUBS);
return ncclInternalError;
}
for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1];
parent->subs[subIndex] = pciNode;
parent->nSubs++;
+84 -7
Parādīt failu
@@ -10,13 +10,13 @@
#include "nccl.h"
#include "debug.h"
#include "checks.h"
#include "alloc.h"
#include <stdlib.h>
// A few constraints to make the implementation easy
#define MAX_STR_LEN 255
#define MAX_ATTR_COUNT 16
#define MAX_SUBS 32
#define MAX_NODES 1024
#define MAX_SUBS 128
#define NODE_TYPE_NONE 0
#define NODE_TYPE_OPEN 1
@@ -37,8 +37,8 @@ struct ncclXmlNode {
};
struct ncclXml {
struct ncclXmlNode nodes[MAX_NODES];
int maxIndex;
int maxIndex, maxNodes;
struct ncclXmlNode nodes[1];
};
/* File functions */
@@ -55,11 +55,27 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
/* Remove unneeded parts */
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
/* Fuse multiple system XMLs into one, skipping duplicate CPUs */
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src);
/* Relocate pointers in XML to (de-)serialize the structure */
ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp);
/**************/
/* XML Struct */
/* Functions */
/**************/
// Size in bytes of a struct ncclXml able to hold `maxNodes` nodes: everything
// that precedes the `nodes` array plus `maxNodes` node entries.
static size_t xmlMemSize(int maxNodes) {
  const size_t headerBytes = offsetof(struct ncclXml, nodes);
  const size_t nodeBytes = sizeof(struct ncclXmlNode)*maxNodes;
  return headerBytes + nodeBytes;
}
// Allocate an XML container with room for `maxNodes` nodes.
//   xml      : receives the newly allocated structure
//   maxNodes : capacity recorded in the structure's maxNodes field
// The storage comes from ncclCalloc as a byte buffer of xmlMemSize(maxNodes)
// bytes (presumably zero-filled -- confirm against ncclCalloc). An allocation
// failure is returned to the caller through NCCLCHECK.
static ncclResult_t xmlAlloc(struct ncclXml** xml, int maxNodes) {
  char* raw;
  NCCLCHECK(ncclCalloc(&raw, xmlMemSize(maxNodes)));
  struct ncclXml* fresh = (struct ncclXml*)raw;
  fresh->maxNodes = maxNodes;
  *xml = fresh;
  return ncclSuccess;
}
static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index) {
*index = -1;
const int nAttrs = node->nAttrs;
@@ -101,6 +117,13 @@ static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* a
return ncclSuccess;
}
// Read attribute `attrName` of `node` as a 64-bit integer.
//   node     : XML node to query
//   attrName : attribute key
//   value    : receives the parsed number
// The text is parsed with base auto-detection (base 0), so "0x..." hex, octal
// and decimal forms are all accepted. Errors from xmlGetAttrStr are propagated.
//
// Parsing goes through strtoull rather than strtol: xmlSetAttrLong prints the
// value as unsigned hex ("%#lx"), so any value with the top bit set is written
// as a number above LONG_MAX, which strtol would clamp to LONG_MAX. strtoull
// covers the full 64-bit range and still honors a leading '-', so the bit
// pattern round-trips for both signed and unsigned inputs.
static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) {
  const char* str;
  NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
  *value = (int64_t)strtoull(str, NULL, 0);
  return ncclSuccess;
}
static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
const char* str;
@@ -121,6 +144,18 @@ static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct
return ncclSuccess;
}
// Find the first node named `tagName` that is stored after `prev` in `xml`.
//   xml     : container to scan (only the first xml->maxIndex nodes are used)
//   tagName : node name to match exactly
//   prev    : node after which the scan starts; it is used as a position
//             inside xml->nodes
//   node    : receives the match, or NULL when there is none
// Always returns ncclSuccess; "not found" is signalled through *node == NULL.
static ncclResult_t xmlFindNextTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode* prev, struct ncclXmlNode** node) {
  *node = NULL;
  int idx = prev-xml->nodes+1;
  while (idx < xml->maxIndex) {
    struct ncclXmlNode* candidate = xml->nodes+idx;
    if (strcmp(candidate->name, tagName) == 0) {
      *node = candidate;
      break;
    }
    idx++;
  }
  return ncclSuccess;
}
static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue) {
*node = NULL;
for (int i=0; i<xml->maxIndex; i++) {
@@ -188,6 +223,19 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa
return ncclSuccess;
}
// Set attribute `attrName` of `node` to `value`, formatted as "0x"-prefixed
// hexadecimal text ("%#lx").
//   node     : XML node to modify
//   attrName : attribute key; a new attribute entry is appended when the key
//              is not present yet
//   value    : number to store
// Key and value text are truncated to MAX_STR_LEN characters. Errors from
// xmlGetAttrIndex are propagated.
// NOTE(review): no capacity check is made before appending a new attribute --
// confirm callers cannot exceed the attribute array.
static ncclResult_t xmlSetAttrLong(struct ncclXmlNode* node, const char* attrName, const int64_t value) {
  int slot;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
  if (slot == -1) {
    // Unknown key: claim the next free attribute entry and record the key.
    slot = node->nAttrs;
    node->nAttrs = slot + 1;
    char* key = node->attrs[slot].key;
    strncpy(key, attrName, MAX_STR_LEN);
    key[MAX_STR_LEN] = '\0';
  }
  char* text = node->attrs[slot].value;
  snprintf(text, MAX_STR_LEN, "%#lx", value);
  text[MAX_STR_LEN] = '\0';
  return ncclSuccess;
}
static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) {
int index;
NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
@@ -234,8 +282,8 @@ static ncclResult_t xmlGetSubKvInt(struct ncclXmlNode* node, const char* subName
}
static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub) {
if (xml->maxIndex == MAX_NODES) {
WARN("Error : too many XML nodes (max %d)", MAX_NODES);
if (xml->maxIndex == xml->maxNodes) {
WARN("Error : too many XML nodes (max %d)", xml->maxNodes);
return ncclInternalError;
}
struct ncclXmlNode* s = xml->nodes+xml->maxIndex++;
@@ -243,7 +291,13 @@ static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent,
s->nAttrs = 0;
*sub = s;
s->parent = parent;
if (parent) parent->subs[parent->nSubs++] = s;
if (parent) {
if (parent->nSubs == MAX_SUBS) {
WARN("Error : too many XML subnodes (max %d)", MAX_SUBS);
return ncclInternalError;
}
parent->subs[parent->nSubs++] = s;
}
strncpy(s->name, subName, MAX_STR_LEN);
s->name[MAX_STR_LEN] = '\0';
return ncclSuccess;
@@ -262,6 +316,29 @@ static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) {
return ncclSuccess;
}
// Deep-copy the subtree rooted at `srcNode` into `dst`, attaching the copy as
// the last child of `parent` (or leaving it parentless when `parent` is NULL).
//   dst     : destination container; new nodes are taken from its free slots
//   parent  : node of `dst` that adopts the copy, or NULL
//   srcNode : root of the subtree to copy
// Returns ncclInternalError when `dst` has no free node left or `parent`
// already has MAX_SUBS children; errors from recursive copies are propagated.
//
// Both capacity limits are checked before anything is modified. A rejected
// call therefore leaves no node in `dst` that is allocated but unreachable
// from its parent. Children copied before a later failure stay in place.
static ncclResult_t xmlAddTree(struct ncclXml* dst, struct ncclXmlNode* parent, struct ncclXmlNode* srcNode) {
  if (dst->maxIndex == dst->maxNodes) {
    WARN("Error : too many XML nodes (max %d)", dst->maxNodes);
    return ncclInternalError;
  }
  if (parent && parent->nSubs == MAX_SUBS) {
    WARN("Error : too many XML subnodes (max %d)", MAX_SUBS);
    return ncclInternalError;
  }

  struct ncclXmlNode* dstNode = dst->nodes+dst->maxIndex++;
  *dstNode = *srcNode;
  dstNode->parent = parent;
  // The struct copy brought along src's child pointers; drop them, the
  // children are re-created inside dst by the recursion below.
  dstNode->nSubs = 0;
  if (parent) {
    parent->subs[parent->nSubs++] = dstNode;
  }

  // Recursively copy the subtree(s)
  for (int i=0; i<srcNode->nSubs; i++) {
    NCCLCHECK(xmlAddTree(dst, dstNode, srcNode->subs[i]));
  }
  return ncclSuccess;
}
// Dictionary for STR -> INT conversions. No dictionary size information,
// there needs to be a last element with str == NULL.
struct kvDict {
@@ -11,6 +11,7 @@
#include "info.h"
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
ncclResult_t CommCheck(struct ncclComm* ptr, const char* opname, const char* ptrname);
ncclResult_t ArgsCheck(struct ncclInfo* info);
ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname);
+3 -1
Parādīt failu
@@ -24,7 +24,9 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag);
ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size);
ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
ncclResult_t bootstrapClose(void* commState);
+27 -3
Parādīt failu
@@ -87,6 +87,12 @@ struct ncclNodeRanks {
int* localRankToRank;
};
// Describes one clique of ranks. struct ncclComm stores one of these as its
// MNNVL clique information.
struct cliqueInfo {
int id; // presumably the clique identifier -- TODO confirm where it is assigned
int size; // number of ranks in the clique; used as the rank count alongside `ranks`
int *ranks; // array of `size` ranks (presumably communicator ranks -- confirm); ownership is not visible here
};
struct ncclDestructor {
struct ncclDestructor* next;
void* obj;
@@ -165,6 +171,14 @@ struct ncclNvlsMcHandleList {
size_t size;
};
// Singly linked list entry holding one collnet handle. The `next` field is the
// link member used by the ncclIntruQueue declared over this type in
// struct ncclKernelPlan.
struct ncclCollnetHandleList {
struct ncclCollnetHandleList *next; // link to the following entry
void* collnetHandle; // opaque handle -- presumably a collnet buffer registration; TODO confirm
size_t size; // presumably the size in bytes of `buffer` -- confirm
const void* buffer; // presumably the user buffer the handle refers to -- confirm
struct ncclProxyConnector* proxyconn; // presumably the proxy connection tied to the handle -- confirm
};
struct ncclKernelPlan {
// A kernel plan is also a callback that reclaims itself. Hence this must
// be the first member.
@@ -188,6 +202,7 @@ struct ncclKernelPlan {
struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
struct ncclIntruQueue<struct ncclNvlsMcHandleList, &ncclNvlsMcHandleList::next> nvlsMcHandleQueue;
struct ncclIntruQueue<struct ncclCollnetHandleList, &ncclCollnetHandleList::next> collnetHandleQueue;
struct Channel {
int nWork;
@@ -202,7 +217,10 @@ struct ncclKernelPlan {
size_t maxBytesPerChannel;
};
#define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28.
struct ncclComm {
uint64_t startMagic;
struct ncclMemoryStack memPermanent, memScoped;
// List of destructors to run when comm is destructed
struct ncclDestructor* destructorHead;
@@ -245,7 +263,10 @@ struct ncclComm {
int* localRankToRank;
// localRanks and localRanktoRank for all nodes
struct ncclNodeRanks* nodeRanks;
int MNNVL; // MNNVL: Multi-Node NVLink
// MNNVL: Multi-Node NVLink
int MNNVL; // true when MNNVL is available
struct cliqueInfo clique; // Our MNNVL clique information
int cliqueRank; // Our rank within the MNNVL clique
bool checkPointers;
bool dmaBufSupport;
@@ -257,7 +278,6 @@ struct ncclComm {
int nChannels; // connection nChannels
int collChannels; // enqueue nChannels
int nvlsChannels; // enqueue nChannels
int collNetChannels;
// Channels (per peer) for p2p
int p2pnChannels;
int p2pnChannelsPerPeer;
@@ -269,6 +289,7 @@ struct ncclComm {
// Buffer sizes
int buffSizes[NCCL_NUM_PROTOCOLS];
int p2pChunkSize;
int nvlsChunkSize;
// Algorithm/Protocols thresholds
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -315,11 +336,11 @@ struct ncclComm {
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
// Whether this communicator uses collNet
int collNetSupport;
bool collNetRegSupport;
uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
int intraHighestTransportType;
int* collNetHeads;
int collNetHeadsNum;
int collNetHeadsUniqueNum;
int* collNetDenseToUserRank;
int* collNetUserToDenseRank;
/* sharable collNet proxy progress resource. */
@@ -336,6 +357,7 @@ struct ncclComm {
struct ncclMemoryPool memPool_ncclKernelPlan;
struct ncclMemoryPool memPool_ncclPointerList;
struct ncclMemoryPool memPool_ncclNvlsHandleList;
struct ncclMemoryPool memPool_ncclCollnetHandleList;
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
// this comm is not yet in a group.
struct ncclComm* groupNext;
@@ -368,8 +390,10 @@ struct ncclComm {
// Tuning plugin
ncclTuner_t* tuner;
void *tunerContext;
// buffer registration cache
struct ncclRegCache regCache;
uint64_t endMagic;
};
enum ncclLaunchMode {
+30 -40
Parādīt failu
@@ -20,10 +20,6 @@ extern int ncclCuMemEnable();
// Handle type used for cuMemCreate()
extern CUmemAllocationHandleType ncclCuMemHandleType;
#else
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
#endif
#define CUPFN(symbol) pfn_##symbol
@@ -69,53 +65,47 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
} \
} while(0)
#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol
#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
#if CUDART_VERSION >= 11030
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000);
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000);
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
// cuMem API support
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
DECLARE_CUDA_PFN_EXTERN(cuMemCreate);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity);
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle);
DECLARE_CUDA_PFN_EXTERN(cuMemMap);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
#if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr);
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate);
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity);
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind);
#endif
#endif
/* CUDA Driver functions loaded with dlsym() */
DECLARE_CUDA_PFN_EXTERN(cuInit, 2000);
DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020);
DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);
ncclResult_t ncclCudaLibraryInit(void);
extern int ncclCudaDriverVersionCache;
+19 -2
Parādīt failu
@@ -84,6 +84,15 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
#define NCCL_IPC_READ 0x10
#define NCCL_NVLS_MIN_POLL 0x20
#define NCCL_MAX_COLLNET_SIZE (1L << 29)
// Kinds of buffer registration. The numeric values are fixed explicitly.
// NOTE(review): the per-value meanings below are inferred from the names --
// confirm against the code that sets and tests them.
enum ncclRegBufferType {
NCCL_REGULAR_BUFFER = 0, // presumably: buffer is not registered
NCCL_IPC_REG_BUFFER = 1, // presumably: registered for IPC
NCCL_NVLS_REG_BUFFER = 2, // presumably: registered for NVLS
NCCL_COLLNET_REG_BUFFER = 3 // presumably: registered for CollNet
};
struct ncclConnInfo {
// Regular comm mechanism
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
@@ -93,6 +102,7 @@ struct ncclConnInfo {
int flags; // Direct communication / other flags
int shared; // Buffers are shared
int stepSize; // Step size for the SIMPLE buffer
void **ptrExchange; // Pointer exchange for direct communication
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
@@ -157,7 +167,7 @@ struct ncclDirect {
int down[NCCL_MAX_DIRECT_ARITY];
};
#define NCCL_MAX_NVLS_ARITY 8
#define NCCL_MAX_NVLS_ARITY 32
#define NCCL_MAX_NVLS_TREE_ARITY 3
struct ncclNvls {
int out;
@@ -171,6 +181,12 @@ struct ncclNvls {
int nNodes;
};
#if __CUDA_ARCH__ >= 900
#define NCCL_MAX_ARITY NCCL_MAX_NVLS_ARITY
#else
#define NCCL_MAX_ARITY NCCL_MAX_DIRECT_ARITY
#endif
#define NCCL_MAX_CONNS 2
struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS];
@@ -212,9 +228,10 @@ struct ncclWorkElem {
union {
uint8_t flagBits;
struct {
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, oneNode:1;
uint8_t isUsed:1, redOpArgIsPtr:1, oneNode:1;
};
};
uint8_t regUsed;
uint8_t nWarps;
uint8_t direct;
uint32_t root;
+6 -6
Parādīt failu
@@ -31,10 +31,10 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr);
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
int ncclPxnDisable(struct ncclComm* comm);
@@ -56,8 +56,8 @@ ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vend
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev);
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex);
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
#define NCCL_TOPO_MAX_NODES 256
@@ -88,7 +88,7 @@ struct ncclTopoGraph {
int sameChannels;
int nHops;
int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
int inter[MAXCHANNELS*2];
int64_t inter[MAXCHANNELS*2];
};
ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
@@ -110,7 +110,7 @@ struct ncclTopoRanks {
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs);
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
#include "info.h"
+3 -7
Parādīt failu
@@ -31,13 +31,6 @@ typedef enum : uint8_t {
ncclPatternRecv
} ncclPattern_t;
enum ncclRegBufferType {
NCCL_REGULAR_BUFFER = 0,
NCCL_IPC_REG_BUFFER = 1,
NCCL_NVLS_REG_BUFFER = 2,
NCCL_REG_BUFFER_NUM = 3
};
// Used to pass NCCL call information between functions
struct ncclInfo {
ncclFunc_t coll;
@@ -70,6 +63,9 @@ struct ncclInfo {
ncclRegBufferType regBufType;
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
// collnet buffer reg handles
void* sendMhandle;
void* recvMhandle;
// Need to initialize
int nThreads;
int nChannels;
@@ -8,7 +8,7 @@
#define NCCL_DEBUG_H_
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+14 -9
Parādīt failu
@@ -17,13 +17,17 @@ typedef struct {
const char* name;
// Initializes tuner states.
// nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
// nNodes: number of nodes in current communicator.
// logFunction: a logFunction can be useful to integrate logging together with NCCL core.
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
// Inputs:
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
// - nNodes: number of nodes in current communicator.
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
// Outputs:
// - context: tuner context object
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
// Inputs:
// - context: tuner context object
// - collType: collective type, e.g., allreduce, allgather...
// - nBytes: collective size in bytes
// - collNetTypeSupport: whether collnet supports this type
@@ -40,16 +44,17 @@ typedef struct {
// Also, the plugin is allowed to not set any output, or set only the
// algorithm and protocol, but not only the algorithm or only the protocol.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
// Terminates the plugin and cleans up any resources that the plugin allocated.
ncclResult_t (*destroy)();
} ncclTuner_v1_t;
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v2_t;
typedef ncclTuner_v1_t ncclTuner_t;
typedef ncclTuner_v2_t ncclTuner_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
#endif
+18 -2
Parādīt failu
@@ -33,7 +33,6 @@ union ncclProxyOpSpecifics {
struct ncclProxyOp {
struct ncclProxyConnection* connection;
void* buffer;
ssize_t nbytes;
uint64_t opCount;
int root;
@@ -49,6 +48,11 @@ struct ncclProxyOp {
uint8_t /*ncclPattern_t*/ pattern;
uint8_t protocol;
uint8_t reg;
// collnet buffer reg handles
void* sendMhandle;
void* recvMhandle;
uint8_t* sendbuff;
uint8_t* recvbuff;
union ncclProxyOpSpecifics specifics;
@@ -58,8 +62,14 @@ struct ncclProxyOp {
struct ncclProxySubArgs {
struct ncclProxyConnection* connection;
int reg;
void* buffer;
// p2p mhandle
void* mhandle;
// collnet handles
void* sendMhandle;
void* recvMhandle;
uint8_t* sendbuff;
uint8_t* recvbuff;
size_t offset;
int channelId;
int nsteps;
ssize_t nbytes;
@@ -88,6 +98,10 @@ struct ncclProxyArgs {
int sliceSteps;
int chunkSteps;
int chunkSize;
size_t totalSendSize;
size_t totalRecvSize;
size_t sendSizePerRound;
size_t recvSizePerRound;
uint8_t /*ncclDataType_t*/ dtype;
uint8_t /*ncclDevRedOp_t*/ redOp;
uint8_t /*ncclPattern_t*/ pattern;
@@ -302,6 +316,8 @@ enum ncclProxyMsgType {
ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8,
ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
ncclProxyMsgRegister = 10,
ncclProxyMsgDeregister = 11
};
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
+5 -1
Parādīt failu
@@ -5,7 +5,8 @@ enum {
NET_REG_COMPLETE = 0x01,
NVLS_REG_COMPLETE = 0x02,
NVLS_REG_POSSIBLE = 0x04,
NVLS_REG_NO_SUPPORT = 0x08
NVLS_REG_NO_SUPPORT = 0x08,
COLLNET_REG_COMPLETE = 0x10
};
struct ncclReg {
@@ -26,6 +27,9 @@ struct ncclReg {
int dev;
CUmemGenericAllocationHandle mcHandle;
uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
// collnet reg
void* collnetHandle;
struct ncclProxyConnector* proxyconn;
};
struct ncclRegCache {
+1
Parādīt failu
@@ -92,6 +92,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
ncclResult_t ncclSocketClose(struct ncclSocket* sock);
#endif
+6 -10
Parādīt failu
@@ -95,6 +95,8 @@ struct ncclTransportComm {
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState);
ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*);
ncclResult_t (*proxyRegister)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
ncclResult_t (*proxyDeregister)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done);
};
struct ncclTransport {
@@ -107,15 +109,6 @@ struct ncclTransport {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
#define USE_POSIX_FD 1
#if USE_POSIX_FD
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
#else
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
#endif
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
@@ -124,7 +117,10 @@ ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdevi
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle);
ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle);
ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle);
#endif
+2 -2
Parādīt failu
@@ -15,8 +15,8 @@
// Attempts to load NCCL tuner from environmental variable.
// Returns ncclSuccess if the correct tuner symbol has been found and
// successfully loaded. Otherwise returns an error and also logs the error.
ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner);
ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner);
// Cleans up NCCL tuner plugin.
ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner);
ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner);
#endif
+168 -114
Parādīt failu
@@ -117,6 +117,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
// Marks a communicator as invalid: overwrites rank, cudaDev, busId and nRanks
// with -1 and zeroes both magic markers (set to NCCL_MAGIC at creation to
// detect comm corruption), so a stale comm pointer no longer passes as live.
// comm: communicator to poison; it is dereferenced unconditionally.
void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
// Important that this does not trash intraComm0.
comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1;
// Clear both magic markers so the poisoned comm no longer carries NCCL_MAGIC.
comm->startMagic = comm->endMagic = 0;
}
#undef NCCL_NO_OPTIMIZE
@@ -280,7 +281,6 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
/* comm must be ready, or error will be reported */
ncclResult_t ret = ncclSuccess;
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
ncclGroupJobAbort(comm->groupJob);
} else {
@@ -351,6 +351,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList);
ncclMemoryPoolConstruct(&comm->memPool_ncclNvlsHandleList);
ncclMemoryPoolConstruct(&comm->memPool_ncclCollnetHandleList);
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
@@ -560,9 +561,8 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
}
// MNNVL support
if (!comm->MNNVL && comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
else if (comm->MNNVL || ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
// Make sure P2P chunksize is not larger than coll chunksize.
@@ -584,16 +584,38 @@ NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0);
// Builds the two CollNet rank translation tables of a communicator:
// collNetUserToDenseRank (user rank -> dense rank) and its inverse
// collNetDenseToUserRank. Both tables hold comm->nRanks entries and are
// allocated from comm->memPermanent.
//
// This rank computes its own dense index locally, every rank's entry is then
// collected with bootstrapAllGather, and the inverse table is derived from the
// gathered result.
//
// Returns ncclSuccess, or the error propagated by NCCLCHECK from the allgather.
static ncclResult_t collNetInitRailRankMap(ncclComm_t comm) {
  const int self = comm->rank;
  // Bitmask of local ranks; bits belonging to heads are toggled off below.
  uint64_t localNonHeads = (1ull << comm->localRanks) - 1;

  comm->collNetDenseToUserRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
  comm->collNetUserToDenseRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
  int* const denseToUser = comm->collNetDenseToUserRank;
  int* const userToDense = comm->collNetUserToDenseRank;

  // A head rank takes its position in collNetHeads as its node-local dense
  // index; the scan stops as soon as this rank is found among the heads.
  int denseIdx = -1;
  for (int h = 0; h < comm->collNetHeadsNum && denseIdx == -1; h++) {
    const int headRank = comm->collNetHeads[h];
    localNonHeads ^= 1ull << comm->rankToLocalRank[headRank];
    if (headRank == self) denseIdx = h;
  }

  // Not a head: count the non-head local ranks that precede this one.
  if (denseIdx == -1) {
    const uint64_t lowerLocalRanks = (1ull << comm->localRank) - 1;
    denseIdx = __builtin_popcountll(localNonHeads & lowerLocalRanks);
  }

  // Shift the node-local index by this node's offset.
  userToDense[self] = denseIdx + comm->node * comm->localRanks;

  NCCLCHECK(bootstrapAllGather(comm->bootstrap, userToDense, sizeof(int)));

  // Invert the gathered user->dense table.
  for (int r = 0; r < comm->nRanks; r++) denseToUser[userToDense[r]] = r;

  return ncclSuccess;
}
static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* collNetGraph) {
ncclResult_t ret = ncclSuccess;
int* heads = NULL;
int rank = comm->rank;
int collNetSetupFail = 0;
int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P };
// Find all head ranks
int nHeads = collNetGraph->nChannels;
int nHeadsUnique = 0;
int headsUnique[NCCL_MAX_LOCAL_RANKS];
int* headsUnique = NULL;
int highestTransportType0, highestTransportType1;
char line[1024];
bool share;
@@ -604,27 +626,26 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
};
struct collnetShareInfo* infos = NULL;
NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&headsUnique, collNetGraph->nChannels), ret, fail);
{ uint64_t mask = 0;
// Head GPU index is always 0
for (int c = 0; c < nHeads; c++) {
heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
assert(comm->rankToNode[heads[c]] == comm->node);
for (int c = 0; c < collNetGraph->nChannels; c++) {
int head = collNetGraph->intra[c * comm->localRanks + 0];
assert(comm->rankToNode[head] == comm->node);
uint64_t mask0 = mask;
mask |= 1ull<<comm->rankToLocalRank[heads[c]];
if (mask != mask0) headsUnique[nHeadsUnique++] = heads[c];
mask |= 1ull<<comm->rankToLocalRank[head];
if (mask != mask0) headsUnique[nHeadsUnique++] = head;
}
}
comm->collNetHeads = heads;
comm->collNetHeadsNum = nHeads;
comm->collNetHeadsUniqueNum = nHeadsUnique;
comm->collNetHeads = headsUnique;
comm->collNetHeadsNum = nHeadsUnique;
if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) {
NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail);
/* check whether child can share collnet resources of parent. Since parent builds each collnet communicator
* based on heads with the same head position in each node, as long as the collnet heads of child comm
* can match parent's heads, we can let child communicator share parent's collnet resources. */
for (int h = 0; h < nHeads; ++h) {
for (int h = 0; h < nHeadsUnique; ++h) {
int prev = INT_MIN;
struct collnetShareInfo* myinfo;
@@ -632,7 +653,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
myinfo = infos + comm->rank;
memset(myinfo, 0, sizeof(struct collnetShareInfo));
/* find the child head position in parent collnet heads. */
if (heads[h] == comm->rank) {
if (headsUnique[h] == comm->rank) {
myinfo->headPosition = -1;
myinfo->isMaster = 1;
for (int th = 0; th < parent->collNetHeadsNum; ++th)
@@ -658,10 +679,11 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
if (share) {
if (myinfo->isMaster) {
comm->collNetSharedRes = parent->collNetSharedRes;
comm->collNetChannels = std::min(comm->nChannels, parent->collNetSharedRes->nChannels);
for (int c = 0; c < comm->collNetChannels; ++c)
for (int c = 0; c < comm->nChannels; ++c)
NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail);
}
NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail);
} else {
/* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot
* share the sharp resource from parent, we cannot use sharp in this case. This restriction might be
@@ -677,35 +699,19 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
} else {
/* this allocated buffer will be freed on proxy side */
NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1));
comm->collNetChannels = comm->collNetSharedRes->nChannels = comm->nChannels;
comm->collNetSharedRes->nChannels = comm->nChannels;
comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
comm->collNetDenseToUserRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
comm->collNetUserToDenseRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
{ // initialize collNetUserToDenseRank[rank]
uint64_t nonHeadMask = (1ull<<comm->localRanks)-1;
comm->collNetUserToDenseRank[rank] = -1;
for (int h=0; h < nHeadsUnique; h++) {
nonHeadMask ^= 1ull<<comm->rankToLocalRank[headsUnique[h]];
if (headsUnique[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; }
}
if (comm->collNetUserToDenseRank[rank] == -1) {
comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull<<comm->localRank)-1));
}
comm->collNetUserToDenseRank[rank] += comm->node*comm->localRanks;
}
NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int)));
for (int r=0; r < comm->nRanks; r++) {
comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r;
}
NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail);
for (int c = 0; c < comm->collNetChannels; c++) {
for (int c = 0; c < comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels + c;
NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail);
for (int h = 0; h < nHeads; h++) {
const int head = heads[h];
collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
for (int h = 0; h < nHeadsUnique; h++) {
const int head = headsUnique[h];
ncclConnect connect;
collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv, &connect);
if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend, &connect);
}
// Verify CollNet setup across ranks after trying the first channel
if (c == 0) {
@@ -727,7 +733,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
bool isHead = false;
matrix = nullptr;
NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end);
for (int h = 0; h < nHeads; h++) isHead |= (heads[h] == comm->rank);
for (int h = 0; h < nHeadsUnique; h++) isHead |= (headsUnique[h] == comm->rank);
if (isHead) {
for (int ty=0; ty < ncclNumTypes; ty++) {
for (int i=0; i < 4; i++) {
@@ -817,7 +823,72 @@ fail:
}
// MNNVL: Flag to indicate whether to enable Multi-Node NVLink
NCCL_PARAM(MNNVL, "MNNVL", -2);
NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2);
#if CUDART_VERSION >= 11030
#include <cuda.h>
#include "cudawrap.h"
// Determine if MNNVL support is available
static int checkMNNVL(struct ncclComm* comm) {
ncclResult_t ret = ncclSuccess;
// MNNVL requires cuMem to be enabled
if (!ncclCuMemEnable()) return 0;
// MNNVL also requires FABRIC handle support
int cudaDev;
int flag = 0;
CUdevice currentDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
if (!flag) return 0;
// Check that all ranks have initialized the fabric fully
for (int i = 0; i < comm->nRanks; i++) {
if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0;
}
// Determine our MNNVL domain/clique
NCCLCHECKGOTO(ncclCalloc(&comm->clique.ranks, comm->nRanks), ret, fail);
comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId;
for (int i = 0; i < comm->nRanks; i++) {
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo;
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
// Check if the cluster UUID and cliqueId match
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) goto fail;
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
if (i == comm->rank) {
comm->cliqueRank = comm->clique.size;
}
comm->clique.ranks[comm->clique.size++] = i;
}
}
// Determine whether to enable MNNVL or not
comm->MNNVL = ncclParamMNNVLEnable() == 2 ? comm->clique.size > 1 : ncclParamMNNVLEnable();
INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d ", comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank);
if (comm->MNNVL) {
// Force the CUMEM handle type to be FABRIC for MNNVL
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
}
return comm->MNNVL;
fail:
if (comm->clique.ranks) free(comm->clique.ranks);
return 0;
}
#else
// Fallback used when CUDART_VERSION < 11030: always reports MNNVL as
// unavailable and leaves comm untouched.
static int checkMNNVL(struct ncclComm* comm) {
return 0;
}
#endif
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) {
// We use 2 AllGathers
@@ -842,6 +913,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
float bwInter;
int typeIntra;
int typeInter;
int crossNic;
};
struct allGatherInfo {
@@ -875,61 +947,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
// AllGather1 - end
#if CUDART_VERSION >= 11030
#include <cuda.h>
#include "cudawrap.h"
// MNNVL support
if (nNodes > 1) {
int cliqueSize = 0;
comm->MNNVL = 0;
// Determine the size of the MNNVL domain/clique
for (int i = 0; i < nranks; i++) {
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[rank].fabricInfo;
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
// Check that the Fabric state is fully initialized
if (fabricInfo2->state != NVML_GPU_FABRIC_STATE_COMPLETED) continue;
// Check that the cluster UUID and cliqueId match in each rank
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) continue;
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
cliqueSize++;
}
}
// Determine whether this is a MNNVL system
comm->MNNVL = ncclParamMNNVL() < 0 ? cliqueSize == comm->nRanks : ncclParamMNNVL();
// MNNVL requires cuMem to be enabled
if (!ncclCuMemEnable()) comm->MNNVL = 0;
if (comm->MNNVL) {
// MNNVL also requires FABRIC handle support
int cudaDev;
int flag = 0;
CUdevice currentDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
if (!flag)
comm->MNNVL = 0;
else
// Force the handle type to be FABRIC for MNNVL
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
}
if (ncclParamMNNVL() == 1 && !comm->MNNVL) {
WARN("MNNVL is not supported on this system");
ret = ncclSystemError;
goto fail;
}
if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) {
// Return an error if the user specifically requested MNNVL support
WARN("MNNVL is not supported on this system");
ret = ncclSystemError;
goto fail;
}
#endif
do {
// Compute intra-process ranks
int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[rank].cudaCompCap);
for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[rank].cudaCompCap);
for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap);
for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap);
comm->nvlsRegSupport = 1;
for (int i = 0; i < nranks; i++) {
@@ -955,6 +985,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
}
}
// Buffer Registration is not supported with MNNVL
if (comm->MNNVL) comm->nvlsRegSupport = 0;
TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
@@ -1065,6 +1099,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
}
comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
@@ -1137,10 +1172,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
}
if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0;
if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = 0;
}
if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0;
if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
if (comm->nChannels < nChannelsOrig) {
@@ -1156,17 +1192,22 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
comm->collNetSupport = 0;
}
comm->collNetRegSupport = true;
for (int n=0; n<comm->nNodes; n++) {
if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
comm->collNetSupport = 0;
break;
}
if (comm->nodeRanks[n].localRanks > 1) {
// As long as there is more than 1 rank on any node, we need to disable collnet reg
comm->collNetRegSupport = false;
}
}
}
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs), ret, fail);
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
// AllGather3 - end
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
@@ -1253,7 +1294,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
// Compute time models for algorithm and protocol combinations
NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->collNetChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
do { // Setup p2p structures in comm->tasks
struct ncclTasks* tasks = &comm->tasks;
@@ -1360,7 +1401,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
/* Local intra-node barrier */
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
@@ -1496,13 +1537,19 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
comm->cudaArch = cudaArch;
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
if (job->parent) {
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init START",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
} else {
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
}
NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail);
NCCLCHECKGOTO(ncclLoadTunerPlugin(&comm->tuner), res, fail);
NCCLCHECKGOTO(ncclTunerPluginLoad(&comm->tuner), res, fail);
if (comm->tuner) {
NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog));
NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext));
}
// update communicator state
@@ -1519,8 +1566,13 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev);
}
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
if (job->parent) {
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init COMPLETE",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
} else {
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
}
exit:
if (job->newcomm) {
/* assign it to user pointer. */
@@ -1729,6 +1781,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
}
NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail);
comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption.
NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail);
NCCLCHECKGOTO(ncclCalloc((uint32_t**)&comm->abortFlagRefCount, 1), res, fail);
*comm->abortFlagRefCount = 1;
@@ -1926,8 +1979,8 @@ static ncclResult_t commCleanup(ncclComm_t comm) {
}
if (comm->tuner != NULL) {
NCCLCHECK(comm->tuner->destroy());
NCCLCHECK(ncclCloseTunerPlugin(&comm->tuner));
NCCLCHECK(comm->tuner->destroy(comm->tunerContext));
NCCLCHECK(ncclTunerPluginUnload(&comm->tuner));
}
NCCLCHECK(commFree(comm));
@@ -2142,7 +2195,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
ncclResult_t res = ncclSuccess;
NCCLCHECK(ncclGroupStartInternal());
NCCLCHECKGOTO(PtrCheck(comm, "CommSplit", "comm"), res, fail);
NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail);
NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);
@@ -2152,6 +2205,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank);
} else {
NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail);
childComm->startMagic = childComm->endMagic = NCCL_MAGIC;
if (comm->config.splitShare) {
childComm->abortFlag = comm->abortFlag;
childComm->abortFlagRefCount = comm->abortFlagRefCount;
@@ -2224,7 +2278,7 @@ const char* ncclGetLastError(ncclComm_t comm) {
NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
NCCLCHECK(CommCheck(comm, "ncclGetAsyncError", "comm"));
NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
*asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
@@ -2236,7 +2290,7 @@ NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
NCCLCHECK(CommCheck(comm, "CommCount", "comm"));
NCCLCHECK(PtrCheck(count, "CommCount", "count"));
/* init thread must be joined before we access the attributes of comm. */
@@ -2250,7 +2304,7 @@ NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
NCCLCHECK(CommCheck(comm, "CommCuDevice", "comm"));
NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
NCCLCHECK(ncclCommEnsureReady(comm));
@@ -2263,7 +2317,7 @@ NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
NCCLCHECK(CommCheck(comm, "CommUserRank", "comm"));
NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
NCCLCHECK(ncclCommEnsureReady(comm));
@@ -2302,7 +2356,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
if (mcSupport) {
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
memprop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
memprop.requestedHandleTypes = ncclCuMemHandleType;
memprop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
@@ -2314,7 +2368,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
mcprop.size = size;
/* device cnt is a dummy value right now, it might affect mc granularity in the future. */
mcprop.numDevices = dcnt;
mcprop.handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
mcprop.handleTypes = ncclCuMemHandleType;
mcprop.flags = 0;
CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+9
Parādīt failu
@@ -33,6 +33,15 @@ ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
return ncclSuccess;
}
// Validates a communicator handle passed in by the caller.
// Returns whatever PtrCheck reports for a bad pointer, ncclInvalidArgument
// when either magic sentinel no longer holds NCCL_MAGIC, ncclSuccess otherwise.
ncclResult_t CommCheck(struct ncclComm* comm, const char* opname, const char* ptrname) {
  // The pointer itself must be valid before any field is read.
  NCCLCHECK(PtrCheck(comm, opname, ptrname));

  // Both sentinels intact: the object is accepted.
  if (comm->startMagic == NCCL_MAGIC && comm->endMagic == NCCL_MAGIC) {
    return ncclSuccess;
  }

  WARN("Error: corrupted comm object detected");
  return ncclInvalidArgument;
}
ncclResult_t ArgsCheck(struct ncclInfo* info) {
// First, the easy ones
if (info->root < 0 || info->root >= info->comm->nRanks) {
+85 -128
Parādīt failu
@@ -9,8 +9,6 @@
#include "param.h"
#include "cudawrap.h"
#include <dlfcn.h>
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
@@ -51,112 +49,119 @@ int ncclCuMemEnable() {
return param >= 0 ? param : (param == -2 && ncclCuMemSupported);
}
#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
#if CUDART_VERSION >= 11030
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
DECLARE_CUDA_PFN(cuDeviceGet, 2000);
DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000);
DECLARE_CUDA_PFN(cuGetErrorString, 6000);
DECLARE_CUDA_PFN(cuGetErrorName, 6000);
DECLARE_CUDA_PFN(cuDeviceGet);
DECLARE_CUDA_PFN(cuDeviceGetAttribute);
DECLARE_CUDA_PFN(cuGetErrorString);
DECLARE_CUDA_PFN(cuGetErrorName);
/* enqueue.cc */
DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020);
DECLARE_CUDA_PFN(cuMemGetAddressRange);
/* proxy.cc */
DECLARE_CUDA_PFN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN(cuCtxGetDevice, 2000);
DECLARE_CUDA_PFN(cuCtxCreate);
DECLARE_CUDA_PFN(cuCtxDestroy);
DECLARE_CUDA_PFN(cuCtxGetCurrent);
DECLARE_CUDA_PFN(cuCtxSetCurrent);
DECLARE_CUDA_PFN(cuCtxGetDevice);
/* cuMem API support */
DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN(cuMemCreate, 10020);
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemMap, 10020);
DECLARE_CUDA_PFN(cuMemRelease, 10020);
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
DECLARE_CUDA_PFN(cuMemAddressReserve);
DECLARE_CUDA_PFN(cuMemAddressFree);
DECLARE_CUDA_PFN(cuMemCreate);
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity);
DECLARE_CUDA_PFN(cuMemExportToShareableHandle);
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle);
DECLARE_CUDA_PFN(cuMemMap);
DECLARE_CUDA_PFN(cuMemRelease);
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle);
DECLARE_CUDA_PFN(cuMemSetAccess);
DECLARE_CUDA_PFN(cuMemUnmap);
/* ncclMemAlloc/Free */
DECLARE_CUDA_PFN(cuPointerGetAttribute, 4000);
DECLARE_CUDA_PFN(cuPointerGetAttribute);
#if CUDA_VERSION >= 11070
/* transport/collNet.cc/net.cc*/
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN(cuMulticastUnbind, 12010);
DECLARE_CUDA_PFN(cuMulticastAddDevice);
DECLARE_CUDA_PFN(cuMulticastBindMem);
DECLARE_CUDA_PFN(cuMulticastBindAddr);
DECLARE_CUDA_PFN(cuMulticastCreate);
DECLARE_CUDA_PFN(cuMulticastGetGranularity);
DECLARE_CUDA_PFN(cuMulticastUnbind);
#endif
#endif
/* CUDA Driver functions loaded with dlsym() */
DECLARE_CUDA_PFN(cuInit, 2000);
DECLARE_CUDA_PFN(cuDriverGetVersion, 2020);
DECLARE_CUDA_PFN(cuGetProcAddress, 11030);
#define CUDA_DRIVER_MIN_VERSION 11030
static void *cudaLib;
int ncclCudaDriverVersionCache = -1;
bool ncclCudaLaunchBlocking = false;
#if CUDART_VERSION >= 11030
/*
 * LOAD_SYM(symbol, ignore): resolve one CUDA driver entry point through the
 * runtime call cudaGetDriverEntryPoint() and store it in pfn_<symbol>.
 *
 *   symbol - bare driver function name; it is stringified for the lookup and
 *            token-pasted to form the destination pointer pfn_<symbol>.
 *   ignore - when zero, a failed lookup emits a WARN and makes the enclosing
 *            function return ncclSystemError; when non-zero the failure is
 *            silently tolerated.
 *
 * The macro assigns to a variable named `res` that must already be declared
 * in the enclosing scope, and it contains a `return`, so it can only be used
 * inside a function returning ncclResult_t.
 *
 * The CUDART >= 12000 variant passes an extra out-parameter and also treats a
 * driverStatus other than cudaDriverEntryPointSuccess as a failure.
 */
#if CUDART_VERSION >= 12000
#define LOAD_SYM(symbol, ignore) do { \
cudaDriverEntryPointQueryResult driverStatus; \
res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \
if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \
if (!ignore) { \
WARN("Retrieve %s failed with %d status %d", #symbol, res, driverStatus); \
return ncclSystemError; } \
} } while(0)
#else
/* Older runtimes: same contract, but the call has no driverStatus argument. */
#define LOAD_SYM(symbol, ignore) do { \
res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault); \
if (res != cudaSuccess) { \
if (!ignore) { \
WARN("Retrieve %s failed with %d", #symbol, res); \
return ncclSystemError; } \
} } while(0)
#endif
/*
Load the CUDA symbols
*/
static ncclResult_t cudaPfnFuncLoader(void) {
CUresult res;
#define LOAD_SYM(symbol, version, ignore) do { \
res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), version, 0); \
if (res != 0) { \
if (!ignore) { \
WARN("Retrieve %s version %d failed with %d", #symbol, version, res); \
return ncclSystemError; } \
} } while(0)
cudaError_t res;
LOAD_SYM(cuGetErrorString, 6000, 0);
LOAD_SYM(cuGetErrorName, 6000, 0);
LOAD_SYM(cuDeviceGet, 2000, 0);
LOAD_SYM(cuDeviceGetAttribute, 2000, 0);
LOAD_SYM(cuMemGetAddressRange, 3020, 1);
LOAD_SYM(cuCtxCreate, 3020, 1);
LOAD_SYM(cuCtxDestroy, 4000, 1);
LOAD_SYM(cuCtxGetCurrent, 4000, 1);
LOAD_SYM(cuCtxSetCurrent, 4000, 1);
LOAD_SYM(cuCtxGetDevice, 2000, 1);
LOAD_SYM(cuGetErrorString, 0);
LOAD_SYM(cuGetErrorName, 0);
LOAD_SYM(cuDeviceGet, 0);
LOAD_SYM(cuDeviceGetAttribute, 0);
LOAD_SYM(cuMemGetAddressRange, 1);
LOAD_SYM(cuCtxCreate, 1);
LOAD_SYM(cuCtxDestroy, 1);
LOAD_SYM(cuCtxGetCurrent, 1);
LOAD_SYM(cuCtxSetCurrent, 1);
LOAD_SYM(cuCtxGetDevice, 1);
/* cuMem API support */
LOAD_SYM(cuMemAddressReserve, 10020, 1);
LOAD_SYM(cuMemAddressFree, 10020, 1);
LOAD_SYM(cuMemCreate, 10020, 1);
LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1);
LOAD_SYM(cuMemExportToShareableHandle, 10020, 1);
LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
LOAD_SYM(cuMemMap, 10020, 1);
LOAD_SYM(cuMemRelease, 10020, 1);
LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1);
LOAD_SYM(cuMemSetAccess, 10020, 1);
LOAD_SYM(cuMemUnmap, 10020, 1);
LOAD_SYM(cuMemAddressReserve, 1);
LOAD_SYM(cuMemAddressFree, 1);
LOAD_SYM(cuMemCreate, 1);
LOAD_SYM(cuMemGetAllocationGranularity, 1);
LOAD_SYM(cuMemExportToShareableHandle, 1);
LOAD_SYM(cuMemImportFromShareableHandle, 1);
LOAD_SYM(cuMemMap, 1);
LOAD_SYM(cuMemRelease, 1);
LOAD_SYM(cuMemRetainAllocationHandle, 1);
LOAD_SYM(cuMemSetAccess, 1);
LOAD_SYM(cuMemUnmap, 1);
/* ncclMemAlloc/Free */
LOAD_SYM(cuPointerGetAttribute, 4000, 1);
LOAD_SYM(cuPointerGetAttribute, 1);
#if CUDA_VERSION >= 11070
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
LOAD_SYM(cuMulticastAddDevice, 12010, 1);
LOAD_SYM(cuMulticastBindMem, 12010, 1);
LOAD_SYM(cuMulticastBindAddr, 12010, 1);
LOAD_SYM(cuMulticastCreate, 12010, 1);
LOAD_SYM(cuMulticastGetGranularity, 12010, 1);
LOAD_SYM(cuMulticastUnbind, 12010, 1);
LOAD_SYM(cuMulticastAddDevice, 1);
LOAD_SYM(cuMulticastBindMem, 1);
LOAD_SYM(cuMulticastBindAddr, 1);
LOAD_SYM(cuMulticastCreate, 1);
LOAD_SYM(cuMulticastGetGranularity, 1);
LOAD_SYM(cuMulticastUnbind, 1);
#endif
return ncclSuccess;
}
@@ -171,47 +176,12 @@ static void initOnceFunc() {
ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
} while (0);
CUresult res;
/*
* Load CUDA driver library
*/
char path[1024];
const char *ncclCudaPath = ncclGetEnv("NCCL_CUDA_PATH");
if (ncclCudaPath == NULL)
snprintf(path, 1024, "%s", "libcuda.so");
else
snprintf(path, 1024, "%s/%s", ncclCudaPath, "libcuda.so");
(void) dlerror(); // Clear any previous errors
cudaLib = dlopen(path, RTLD_LAZY);
if (cudaLib == NULL) {
WARN("Failed to find CUDA library %s (NCCL_CUDA_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror());
goto error;
}
/*
* Load initial CUDA functions
*/
pfn_cuInit = (PFN_cuInit_v2000) dlsym(cudaLib, "cuInit");
if (pfn_cuInit == NULL) {
WARN("Failed to load CUDA missing symbol cuInit");
goto error;
}
pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion_v2020) dlsym(cudaLib, "cuDriverGetVersion");
if (pfn_cuDriverGetVersion == NULL) {
WARN("Failed to load CUDA missing symbol cuDriverGetVersion");
goto error;
}
ncclResult_t ret = ncclSuccess;
int cudaDev;
int driverVersion;
res = pfn_cuDriverGetVersion(&driverVersion);
if (res != 0) {
WARN("cuDriverGetVersion failed with %d", res);
goto error;
}
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error); // Initialize the driver
CUDACHECKGOTO(cudaDriverGetVersion(&driverVersion), ret, error);
INFO(NCCL_INIT, "cudaDriverVersion %d", driverVersion);
if (driverVersion < CUDA_DRIVER_MIN_VERSION) {
@@ -220,19 +190,6 @@ static void initOnceFunc() {
goto error;
}
pfn_cuGetProcAddress = (PFN_cuGetProcAddress_v11030) dlsym(cudaLib, "cuGetProcAddress");
if (pfn_cuGetProcAddress == NULL) {
WARN("Failed to load CUDA missing symbol cuGetProcAddress");
goto error;
}
/*
* Required to initialize the CUDA Driver.
* Multiple calls of cuInit() will return immediately
* without making any relevant change
*/
pfn_cuInit(0);
#if CUDART_VERSION >= 11030
if (cudaPfnFuncLoader()) {
WARN("CUDA some PFN functions not found in the library");
@@ -243,7 +200,7 @@ static void initOnceFunc() {
// Determine whether we support the cuMem APIs or not
ncclCuMemSupported = ncclIsCuMemSupported();
initResult = ncclSuccess;
initResult = ret;
return;
error:
initResult = ncclSystemError;
+18
Parādīt failu
@@ -790,6 +790,24 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
return ncclSuccess;
}
// Drives a send on sendSock and a receive on recvSock side by side until
// sendSize bytes have gone out and recvSize bytes have come in.
// Returns ncclInternalError when either socket pointer is NULL or either
// socket is not in the ready state; otherwise propagates any socketProgress
// failure, or ncclSuccess once both transfers are complete.
ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize) {
  if (sendSock == NULL || recvSock == NULL) {
    WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock);
    return ncclInternalError;
  }
  if (sendSock->state != ncclSocketStateReady || recvSock->state != ncclSocketStateReady) {
    WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state);
    return ncclInternalError;
  }

  int bytesSent = 0;
  int bytesReceived = 0;
  for (;;) {
    // Stop as soon as neither direction has anything left to move.
    if (bytesSent >= sendSize && bytesReceived >= recvSize) break;

    // Alternate between the two directions, skipping whichever is finished.
    if (bytesSent < sendSize) {
      NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sendSock, sendPtr, sendSize, &bytesSent));
    }
    if (bytesReceived < recvSize) {
      NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, recvSock, recvPtr, recvSize, &bytesReceived));
    }
  }
  return ncclSuccess;
}
// Receive or detect connection closed
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
int offset = 0;
+160 -51
Parādīt failu
@@ -13,69 +13,178 @@
#include "nccl_tuner.h"
pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
static int tunerPluginRefCount = -1;
static int tunerPluginRefCount;
static void* tunerPluginLib = nullptr;
ncclTuner_t* tunerSymbol = nullptr;
ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) {
// Initialize to nullptr by default if plugin tuner cannot be loaded.
*tuner = nullptr;
if (tunerPluginRefCount == -2) return ncclSuccess;
pthread_mutex_lock(&tunerPluginLock);
if (tunerPluginRefCount == -1) {
tunerPluginRefCount = -2; // Default: no plugin, don't try again later
const char* name = getenv("NCCL_TUNER_PLUGIN");
if (name) {
INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name);
tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
if (tunerPluginLib == nullptr) {
// dlopen does not guarantee to set errno, but dlerror only gives us a
// string, so checking errno doesn't hurt to try to provide a better
// error message
if (errno == ENOENT) {
INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
} else {
INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
}
} else {
tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
if (tunerSymbol == nullptr) {
INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
dlclose(tunerPluginLib);
tunerPluginLib = nullptr;
} else {
INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
tunerPluginRefCount = 0;
}
}
static void* tryOpenDynamicLib(const char* name) {
if (nullptr == name || strlen(name) == 0) {
return nullptr;
}
void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
if (nullptr == handle) {
if (ENOENT == errno) {
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: No plugin found (%s)", name);
}
}
if (tunerPluginRefCount >= 0) {
*tuner = tunerSymbol;
INFO(NCCL_INIT, "Using tuner plugin: '%s'", tunerSymbol->name);
tunerPluginRefCount++;
}
pthread_mutex_unlock(&tunerPluginLock);
return ncclSuccess;
return handle;
}
ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner) {
// Logs a one-line summary once every attempt to open a tuner plugin library
// has failed.
//
// pluginNames: space-separated list of the library names that were tried,
//              each name followed by one separator. The buffer is edited in
//              place (the trailing separator and the last name are cut off).
static void summarizeOpenTunerPluginLibErrors(char* pluginNames) {
  const char *separator = " ";

  // Snapshot the error state before any other library call can disturb it.
  // dlerror() returns NULL when no error is pending, and NULL must not be
  // handed to a %s conversion.
  const int loadErrno = errno;
  const char *loadError = dlerror();
  if (loadError == nullptr) {
    loadError = "(no dlerror message)";
  }

  // Nothing recorded: avoid indexing pluginNames[-1] below.
  if (pluginNames == nullptr || pluginNames[0] == '\0') {
    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin load returned %d : %s : when loading %s", loadErrno, loadError, "");
    return;
  }

  int len = strlen(pluginNames);
  // remove tail separator (only if it really is one, so that a truncated
  // list does not lose a character of its last name)
  if (pluginNames[len - 1] == *separator) {
    pluginNames[len - 1] = '\0';
  }

  // remove last plugin name
  while (len > 0 && pluginNames[--len] != *separator);
  if (len > 0) {
    pluginNames[len] = '\0';
  }

  // distinguish between one load attempt and multiple attempts
  if (strstr(pluginNames, separator)) {
    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Most recent plugin load returned %d : %s. All attempts to load '%s' also failed.", loadErrno, loadError, pluginNames);
  } else {
    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin load returned %d : %s : when loading %s", loadErrno, loadError, pluginNames);
  }
}
// Appends "<name> " to the space-separated list of tuner library names that
// failed to open. The bound is the real size of the list buffer, so earlier
// entries never shrink the room available to later ones.
static void recordFailedTunerPluginLib(char* tried, size_t triedSize, const char* name) {
  size_t used = strlen(tried);
  if (used + 1 >= triedSize) return; // list is full, nothing more can be recorded
  snprintf(tried + used, triedSize - used, "%s ", name);
}

// Tries to dlopen a library providing the tuner plugin.
//
// Candidates, in order:
//   1. $NCCL_TUNER_PLUGIN taken verbatim, then libnccl-tuner-$NCCL_TUNER_PLUGIN.so;
//      or libnccl-tuner.so when the variable is unset or empty.
//   2. $NCCL_NET_PLUGIN taken verbatim, then libnccl-net-$NCCL_NET_PLUGIN.so;
//      or libnccl-net.so when the variable is unset or empty (the tuner may be
//      packed into the net plugin).
//
// Returns the handle of the first library that opens, or nullptr after
// logging a summary of the failed attempts.
static void* openTunerPluginLib(void) {
  void *pluginLib;

#define MAX_PLUGIN_LOAD 4

  // One PATH_MAX-sized slot (name + separator) per attempt, plus the terminator.
  char tunerPluginLibNameTried[MAX_PLUGIN_LOAD * PATH_MAX + 1] = { 0 };
  char *ptr = tunerPluginLibNameTried;
  char tunerPluginLibName[PATH_MAX];
  const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN");
  if (envTunerPluginName && strlen(envTunerPluginName)) {
    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName);
    snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName);
    pluginLib = tryOpenDynamicLib(tunerPluginLibName);
    if (pluginLib) {
      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
      return pluginLib;
    }
    recordFailedTunerPluginLib(ptr, sizeof(tunerPluginLibNameTried), tunerPluginLibName);

    snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName);
    pluginLib = tryOpenDynamicLib(tunerPluginLibName);
    if (pluginLib) {
      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
      return pluginLib;
    }
    recordFailedTunerPluginLib(ptr, sizeof(tunerPluginLibNameTried), tunerPluginLibName);
  } else {
    snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so");
    pluginLib = tryOpenDynamicLib(tunerPluginLibName);
    if (pluginLib) {
      return pluginLib;
    }
    recordFailedTunerPluginLib(ptr, sizeof(tunerPluginLibNameTried), tunerPluginLibName);
  }

  const char *envNetPluginName = getenv("NCCL_NET_PLUGIN");
  if (envNetPluginName && strlen(envNetPluginName)) {
    // Users are allowed to pack tuner into the net plugin
    snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName);
    pluginLib = tryOpenDynamicLib(tunerPluginLibName);
    if (pluginLib) {
      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
      return pluginLib;
    }
    recordFailedTunerPluginLib(ptr, sizeof(tunerPluginLibNameTried), tunerPluginLibName);

    snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName);
    pluginLib = tryOpenDynamicLib(tunerPluginLibName);
    if (pluginLib) {
      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
      return pluginLib;
    }
    recordFailedTunerPluginLib(ptr, sizeof(tunerPluginLibNameTried), tunerPluginLibName);
  } else {
    snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so");
    pluginLib = tryOpenDynamicLib(tunerPluginLibName);
    if (pluginLib) {
      return pluginLib;
    }
    recordFailedTunerPluginLib(ptr, sizeof(tunerPluginLibNameTried), tunerPluginLibName);
  }
  summarizeOpenTunerPluginLibErrors(ptr);

  return nullptr;
}
// Load state tracked by ncclTunerPluginLoad across calls.
enum {
// A previous attempt could not open the library or find the tuner symbol;
// later calls return immediately with a null tuner.
tunerPluginLoadFailed = -1,
// Initial state: no load attempt has been made yet.
tunerPluginLoadReady = 0,
// The plugin symbol was resolved; later calls hand out the cached symbol.
tunerPluginLoadSuccess = 1,
};
// Hands out the external tuner plugin, loading it on first use.
//
// tuner: out-parameter; receives the plugin's ncclTuner_t symbol, or nullptr
//        when no plugin is available (the internal tuner is then used).
//
// Always returns ncclSuccess: a missing or unusable plugin is not an error.
// Each call that yields a non-null tuner takes one reference on the plugin.
ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner) {
  // Initialize to nullptr by default if plugin tuner cannot be loaded.
  *tuner = nullptr;
  static int status = tunerPluginLoadReady;
  // Fast path once a load has failed.
  // NOTE(review): this read happens without tunerPluginLock held; the state
  // is re-checked under the lock right below.
  if (tunerPluginLoadFailed == status) {
    return ncclSuccess;
  }

  pthread_mutex_lock(&tunerPluginLock);
  if (tunerPluginLoadFailed == status) {
    goto exit;
  }

  // Reuse the cached symbol only while the plugin is still loaded. When the
  // last reference is dropped the unload path clears tunerSymbol, but status
  // (local to this function) still says "success"; in that case fall through
  // and open the plugin again instead of returning a null tuner while
  // counting a reference for it.
  if (tunerPluginLoadSuccess == status && tunerSymbol != nullptr) {
    *tuner = tunerSymbol;
    ++tunerPluginRefCount;
    goto exit;
  }

  tunerPluginLib = openTunerPluginLib();
  if (nullptr == tunerPluginLib) {
    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin.");
    goto fail;
  }

  tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
  if (tunerSymbol == nullptr) {
    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find " NCCL_TUNER_PLUGIN_SYMBOL ", using internal tuner instead.");
    dlclose(tunerPluginLib);
    goto fail;
  }

  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name);
  *tuner = tunerSymbol;
  ++tunerPluginRefCount;
  status = tunerPluginLoadSuccess;

exit:
  pthread_mutex_unlock(&tunerPluginLock);
  return ncclSuccess;
fail:
  // Remember the failure so later calls do not retry.
  tunerPluginLib = nullptr;
  status = tunerPluginLoadFailed;
  goto exit;
}
ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner) {
if (*tuner == nullptr) return ncclSuccess;
pthread_mutex_lock(&tunerPluginLock);
if (--tunerPluginRefCount == 0) {
if (tunerPluginLib == nullptr) {
WARN("Tuner plugin refcount is 0, yet tunerPluginLib ptr is NULL\n");
} else {
INFO(NCCL_TUNING, "Closing tuner: '%s'", tunerSymbol->name);
dlclose(tunerPluginLib);
}
if (0 == (--tunerPluginRefCount)) {
INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name);
dlclose(tunerPluginLib);
tunerPluginLib = nullptr;
tunerSymbol = nullptr;
*tuner = nullptr;
tunerPluginRefCount = -1;
}
pthread_mutex_unlock(&tunerPluginLock);
return ncclSuccess;
-9
Parādīt failu
@@ -174,7 +174,6 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
/* Register CUDA buffer for zero-copy operation */
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
@@ -433,14 +432,6 @@ ncclResult_t pncclGroupStart();
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
/* Register CUDA buffer for zero-copy operation */
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
/* Deregister CUDA buffer */
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
#ifdef __cplusplus
} // end extern "C"
#endif
+78 -17
Parādīt failu
@@ -339,26 +339,87 @@ enum ncclNetState {
enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
ncclResult_t ncclNetPluginInit() {
char ncclNetPluginName[128];
const char* envPluginName = ncclGetEnv("NCCL_NET_PLUGIN");
if (envPluginName && strlen(envPluginName)) {
snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName);
INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName);
} else {
sprintf(ncclNetPluginName, "libnccl-net.so");
static void* tryOpenDynamicLib(char* name) {
if (nullptr == name || strlen(name) == 0) {
return nullptr;
}
void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
if (netPluginLib == nullptr) {
// dlopen does not guarantee to set errno, but dlerror only gives us a
// string, so checking errno doesn't hurt to try to provide a better
// error message
if (errno == ENOENT) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : dlerror=%s No plugin found (%s), using internal implementation", dlerror(), ncclNetPluginName);
// exit(-1);
void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
if (nullptr == handle) {
if (ENOENT == errno) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: No plugin found (%s)", name);
} else {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin load returned %d : %s when loading %s", errno, dlerror(), name);
}
}
return handle;
}
static void summarizeOpenNetPluginErrors(char* pluginNames) {
const char *separator = " ";
int len = strlen(pluginNames);
// remove tail separator
pluginNames[len - 1] = '\0';
// remove last plugin name
while (len > 0 && pluginNames[--len] != *separator);
if (len > 0) {
pluginNames[len] = '\0';
}
// distinguish between one load attempt and multiple attempts
if (strstr(pluginNames, separator)) {
INFO(NCCL_ENV|NCCL_TUNING, "NET/Plugin: Most recent plugin load returned %d : %s. All attempts to load '%s' also failed.", errno, dlerror(), pluginNames);
} else {
INFO(NCCL_ENV|NCCL_TUNING, "NET/Plugin: Plugin load returned %d : %s : when loading %s", errno, dlerror(), pluginNames);
}
}
// Appends "<name> " to the space-separated list of network library names
// that failed to open. The bound is the real size of the list buffer, so
// earlier entries never shrink the room available to later ones.
static void recordFailedNetPluginLib(char* tried, size_t triedSize, const char* name) {
  size_t used = strlen(tried);
  if (used + 1 >= triedSize) return; // list is full, nothing more can be recorded
  snprintf(tried + used, triedSize - used, "%s ", name);
}

// Tries to dlopen the external network plugin library.
//
// Candidates, in order: $NCCL_NET_PLUGIN taken verbatim, then
// libnccl-net-$NCCL_NET_PLUGIN.so; or libnccl-net.so when the variable is
// unset or empty.
//
// Returns the handle of the first library that opens, or nullptr after
// logging a summary of the failed attempts.
static void* openNetPluginLib(void) {
  void *pluginLib;

#define MAX_PLUGIN_LOAD 2

  // One PATH_MAX-sized slot (name + separator) per attempt, plus the terminator.
  char netPluginLibNameTried[MAX_PLUGIN_LOAD * PATH_MAX + 1] = { 0 };
  char *ptr = netPluginLibNameTried;
  char netPluginLibName[PATH_MAX];
  const char *envNetPluginName = getenv("NCCL_NET_PLUGIN");
  if (envNetPluginName && strlen(envNetPluginName)) {
    snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName);
    pluginLib = tryOpenDynamicLib(netPluginLibName);
    if (pluginLib) {
      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName);
      return pluginLib;
    }
    recordFailedNetPluginLib(ptr, sizeof(netPluginLibNameTried), netPluginLibName);

    snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName);
    pluginLib = tryOpenDynamicLib(netPluginLibName);
    if (pluginLib) {
      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName);
      return pluginLib;
    }
    recordFailedNetPluginLib(ptr, sizeof(netPluginLibNameTried), netPluginLibName);
  } else {
    snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so");
    pluginLib = tryOpenDynamicLib(netPluginLibName);
    if (pluginLib) {
      return pluginLib;
    }
    recordFailedNetPluginLib(ptr, sizeof(netPluginLibNameTried), netPluginLibName);
  }
  summarizeOpenNetPluginErrors(ptr);

  return nullptr;
}
ncclResult_t ncclNetPluginInit() {
void* netPluginLib = openNetPluginLib();
if (netPluginLib == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin.");
return ncclSuccess;
}
+17 -10
Parādīt failu
@@ -358,9 +358,13 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
sub->channelId = op->channelId;
sub->nsteps = op->nsteps;
sub->nbytes = op->nbytes;
sub->offset = 0;
sub->peer = op->root;
sub->reg = op->reg;
sub->buffer = op->buffer;
sub->sendMhandle = op->sendMhandle;
sub->recvMhandle = op->recvMhandle;
sub->sendbuff = op->sendbuff;
sub->recvbuff = op->recvbuff;
args->nsubs = subIndex+1;
if (subIndex) {
if ((args->sliceSteps != op->sliceSteps) ||
@@ -634,7 +638,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op,
if (ncclParamChunkSize() != 0) {
info->chunkSize = ncclParamChunkSize();
}
op->buffer = op->reg ? info->recvbuff : NULL;
op->recvbuff = op->reg ? (uint8_t*)info->recvbuff : NULL;
op->chunkSize = info->chunkSize;
op->nbytes = info->count;
@@ -820,7 +824,7 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
if (createThreadContext) {
if (proxyState->cudaCtx == NULL) {
if (CUPFN(cuCtxCreate(&proxyState->cudaCtx,
CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
NULL, 0, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
createThreadContext = 0;
}
@@ -1083,7 +1087,8 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) {
ncclResult_t res = ncclSuccess;
struct ncclIpcSocket ipcSock = { 0 };
void *opId = (void*)((((uintptr_t)random()) << 32) | random());
void *opId;
NCCLCHECK(getRandomData(&opId, sizeof(opId)));
int rank = comm->topParentLocalRanks[comm->localRank];
struct ncclProxyState* sharedProxyState = comm->proxyState;
@@ -1365,6 +1370,12 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
else if (op->type == ncclProxyMsgInit) {
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection);
} else if (op->type == ncclProxyMsgRegister) {
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgRegister opId=%p op.reqBuff=%p, op->reqSize=%d, op->respSize=%d", op->opId, op->reqBuff, op->reqSize, op->respSize);
res = op->connection->tcomm->proxyRegister(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done);
} else if (op->type == ncclProxyMsgDeregister) {
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgDeregister opId=%p op.reqBuff=%p, op->reqSize=%d, op->respSize=%d", op->opId, op->reqBuff, op->reqSize, op->respSize);
res = op->connection->tcomm->proxyDeregister(op->connection, proxyState, op->reqBuff, op->reqSize, &done);
} else return ncclInternalError;
if (done) {
@@ -1435,6 +1446,8 @@ static bool proxyMatchOpType(int type) {
case ncclProxyMsgSetup:
case ncclProxyMsgConnect:
case ncclProxyMsgGetFd:
case ncclProxyMsgRegister:
case ncclProxyMsgDeregister:
return true;
default:
return false;
@@ -1663,12 +1676,6 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union
// UDS support
NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag));
// Seed the random number generator for UDS filename generation
struct timeval time;
gettimeofday(&time,NULL);
unsigned int seed = time.tv_sec*time.tv_usec;
seed ^= getpid();
srandom(seed);
return ncclSuccess;
}
+6 -3
Parādīt failu
@@ -34,7 +34,7 @@ ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, str
// Find local devices for p2p operations
for (int c=0; c<comm->p2pnChannels; c++) {
int dev;
if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, &dev) != ncclSuccess) goto end; // No local net
if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, NULL, &dev) != ncclSuccess) goto end; // No local net
ncclNetProperties_t props;
NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end);
if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration.
@@ -152,7 +152,7 @@ ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
NCCLCHECK(ncclRegister(comm, buff, size, handle));
return ncclSuccess;
@@ -160,7 +160,7 @@ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, vo
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
struct ncclReg* reg = (struct ncclReg*)handle;
struct ncclRegCache* cache = &comm->regCache;
int slot;
@@ -175,6 +175,9 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
NCCLCHECK(ncclNvlsDeregBuffer(&reg->mcHandle, reg->regAddr, reg->dev, reg->regSize));
reg->regAddr = (CUdeviceptr)NULL;
}
if (reg->state & COLLNET_REG_COMPLETE) {
NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle));
}
free(reg);
memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
cache->population -= 1;
+7 -19
Parādīt failu
@@ -229,7 +229,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
* others might still be trying to connect and import the buffer. No sync can lead to invalid
* shmem/cuda buffer. In addition, we also clear all connect masks and free each connectInfo array */
for (int i = 1; i < comm->nRanks; i++) {
int bootstrapTag = (i << 8) + (graph ? graph->id + 1 : 0);
int bootstrapTag = (i << 8) + (1 << 7) + (graph ? graph->id + 1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
int flag = 0;
@@ -271,27 +271,19 @@ extern struct ncclTransport collNetTransport;
// All ranks must participate in collNetSetup call
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type) {
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
int fail = 1;
int rank = comm->rank;
int nranks = comm->nRanks;
int nMasters = comm->nNodes;
int rankInCollNet = -1;
int isMaster = (rank == masterRank) ? 1 : 0;
struct {
int collNetRank;
ncclConnect connect;
} sendrecvExchange;
// check if we can connect to collnet, whose root is the nranks-th rank
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
peerInfo->rank = nranks;
// send master receives connect info from peer recv master
if (isMaster && type == collNetSend) {
NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)));
rankInCollNet = sendrecvExchange.collNetRank;
TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, comm->node, nMasters, masterPeer);
}
// select
@@ -327,24 +319,20 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
c++;
}
}
if (isMaster) rankInCollNet = comm->node;
} else { // send side : copy in connect info received from peer recv master
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
if (isMaster) memcpy(masterConnects+comm->node, connect, sizeof(struct ncclConnect));
}
// connect
if (isMaster) {
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), res, cleanup);
struct ncclDevChannelPeer* devRoot;
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
}
// recv side sends connect info to send side
if (isMaster && type == collNetRecv) {
sendrecvExchange.collNetRank = rankInCollNet;
memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
memcpy(connect, masterConnects+comm->node, sizeof(struct ncclConnect));
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, comm->node, nMasters, masterPeer);
}
fail = 0;
cleanup:
+252 -34
Parādīt failu
@@ -9,6 +9,7 @@
#include "graph.h"
#include "proxy.h"
#include "gdrwrap.h"
#include "assert.h"
int64_t ncclParamGdrCopySyncEnable();
int64_t ncclParamGdrCopyFlushEnable();
@@ -151,8 +152,9 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
struct setupReq req = { 0 };
int proxyRank, tpProxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
int64_t netId;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
@@ -171,8 +173,9 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
struct setupReq req = { 0 };
int proxyRank, tpProxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
int64_t netId;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
// Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
@@ -696,8 +699,10 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0);
__sync_synchronize();
if (sub->reg == 0) {
resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0);
__sync_synchronize();
}
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS));
sub->posted += args->sliceSteps;
@@ -708,8 +713,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
if (connFifo[buffSlot].size != -1 && ((*recvTail > (sub->base+sub->received)))) {
if (args->coll != ncclFuncAllReduce) {
if ((connFifo[buffSlot].size != -1 || sub->reg) && ((*recvTail > (sub->base+sub->received)))) {
if (args->coll != ncclFuncAllReduce && sub->reg == 0) {
int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0);
int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1);
if (sendEnd-sendBeg != connFifo[buffSlot].size) {
@@ -740,33 +745,89 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
reqFifo[group][buffSlot].size = recvEnd - recvBeg;
size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype);
if (sendBeg==sendEnd && recvBeg==recvEnd) {
if (sendBeg==sendEnd && recvBeg==recvEnd && sub->reg == 0) {
sub->requests[buffSlot] = nullptr; // trivally finished request
} else {
if (args->coll == ncclFuncAllReduce) {
int count = (sendEnd-sendBeg)/eltSize;
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region+sendBeg, region+recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
if (sub->reg) {
size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
int count = (int)(nBytes / eltSize);
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff, sub->recvbuff, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, sub->requests + buffSlot));
if (sub->requests[buffSlot]) {
sub->nbytes -= nBytes;
sub->sendbuff += nBytes;
sub->recvbuff += nBytes;
}
} else {
int count = (sendEnd - sendBeg) / eltSize;
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests + buffSlot));
}
} else {
sizePerRank = args->specifics.collnetDirect.sizePerRank;
if (args->coll == ncclFuncAllGather) {
ncclNetSGE_v8_t recvParts;
recvParts.mhandle = recvMhandle;
recvParts.address = region + recvBeg;
recvParts.size = allEnd - allBeg;
NCCLCHECK(proxyState->ncclCollNet->iallgather(
resources->collNetComm, region+sendBeg, 1, &recvParts,
sizePerRank, allBeg, allEnd-allBeg,
sendMhandle, sub->requests+buffSlot));
if (sub->reg) {
size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
void *sendbuff;
recvParts.mhandle = sub->recvMhandle;
recvParts.address = sub->recvbuff;
recvParts.size = nBytes;
if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) {
sendbuff = sub->sendbuff + sub->offset % sizePerRank;
} else {
sendbuff = sub->sendbuff;
}
NCCLCHECK(proxyState->ncclCollNet->iallgather(
resources->collNetComm, sendbuff, 1, &recvParts,
sizePerRank, sub->offset, nBytes,
sub->sendMhandle, sub->requests + buffSlot));
if (sub->requests[buffSlot]) {
sub->recvbuff += nBytes;
sub->nbytes -= nBytes;
sub->offset += nBytes;
}
} else {
recvParts.mhandle = recvMhandle;
recvParts.address = region + recvBeg;
recvParts.size = allEnd - allBeg;
NCCLCHECK(proxyState->ncclCollNet->iallgather(
resources->collNetComm, region + sendBeg, 1, &recvParts,
sizePerRank, allBeg, allEnd - allBeg,
sendMhandle, sub->requests + buffSlot));
}
} else {
ncclNetSGE_v8_t sendParts;
sendParts.mhandle = sendMhandle;
sendParts.address = region + sendBeg;
sendParts.size = allEnd - allBeg;
NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
resources->collNetComm, 1, &sendParts, region+recvBeg,
sizePerRank, allBeg, allEnd-allBeg,
(ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
recvMhandle, sub->requests+buffSlot));
if (sub->reg) {
size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
void *recvbuff;
sendParts.mhandle = sub->sendMhandle;
sendParts.address = sub->sendbuff;
sendParts.size = nBytes;
if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) {
recvbuff = sub->recvbuff + sub->offset % sizePerRank;
} else {
recvbuff = sub->recvbuff;
}
NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
resources->collNetComm, 1, &sendParts, recvbuff,
sizePerRank, sub->offset, nBytes,
(ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
sub->recvMhandle, sub->requests + buffSlot));
if (sub->requests[buffSlot]) {
sub->sendbuff += nBytes;
sub->nbytes -= nBytes;
sub->offset += nBytes;
}
} else {
sendParts.mhandle = sendMhandle;
sendParts.address = region + sendBeg;
sendParts.size = allEnd - allBeg;
NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
resources->collNetComm, 1, &sendParts, region + recvBeg,
sizePerRank, allBeg, allEnd - allBeg,
(ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
recvMhandle, sub->requests + buffSlot));
}
}
}
if (sub->requests[buffSlot] == nullptr) continue;
@@ -854,7 +915,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
int totalSize = recvEnd - recvBeg;
TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize);
sub->received += args->sliceSteps;
if (reqFifo[group][buffSlot].size > 0 && resources->useGdr && resources->needFlush) {
if ((reqFifo[group][buffSlot].size > 0 || sub->reg) && resources->useGdr && resources->needFlush) {
// GDRCOPY support
if (resources->gdcFlush) {
#if defined (__x86_64__)
@@ -865,7 +926,37 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
return ncclInternalError;
#endif
} else {
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot));
if (sub->reg) {
size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
size_t offset = 0;
if (args->coll == ncclFuncReduceScatter) {
size_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
int node = args->specifics.collnetDirect.node;
int startNode = sub->offset / sizePerRank;
int lastNode = (sub->offset + nBytes) / sizePerRank;
if (startNode == node) {
offset = sub->offset % sizePerRank;
nBytes = std::min(sizePerRank - offset, nBytes);
} else if (startNode < node && node < lastNode) {
nBytes = sizePerRank;
} else if (node == lastNode) {
nBytes = (sub->offset + nBytes) % sizePerRank;
} else {
// no need to flush
nBytes = 0;
}
}
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset, nBytes, sub->recvMhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot]) {
sub->nbytes -= nBytes;
sub->offset += nBytes;
if (args->coll == ncclFuncAllGather || args->coll == ncclFuncAllReduce) {
sub->recvbuff += nBytes;
}
}
} else {
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot));
}
}
}
args->idle = 0;
@@ -886,10 +977,12 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
}
}
if (sub->transmitted < sub->flushed) {
int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0);
__sync_synchronize();
if (sub->reg == 0) {
int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0);
__sync_synchronize();
}
volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
*recvTail = sub->base + sub->flushed;
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
@@ -916,9 +1009,134 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
return ncclSuccess;
}
// Request payload for a CollNet buffer registration. It is passed to the proxy
// with ncclProxyMsgRegister and read back by sendProxyRegBuffer /
// recvProxyRegBuffer, which hand both fields to the CollNet plugin's regMr().
struct collnetRegInfo {
// Start address of the region to register, carried as an integer; the proxy
// side casts it back to void* before calling regMr().
uintptr_t buffer;
// Length of the region, passed unchanged as the size argument of regMr().
size_t size;
};
// Makes sure a user buffer that is already present in the communicator's
// registration cache is also registered with CollNet.
//
// Outputs:
//   *outRegBufFlag  0 - no CollNet registration is available for the buffer,
//                   1 - the buffer was registered by this call,
//                   2 - a registration made earlier was reused.
//   *outHandle      the CollNet handle, or NULL whenever *outRegBufFlag is 0.
//
// Returns the error reported by ncclRegFind or by the proxy call; in that case
// both outputs are reset. A NULL/empty input, a buffer without a cache record,
// or a NULL handle coming back from the proxy is not an error: the function
// returns ncclSuccess with *outRegBufFlag == 0.
ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) {
  ncclResult_t ret = ncclSuccess;
  struct ncclReg* cached = NULL;
  struct ncclProxyConnector* connector = NULL;
  struct collnetRegInfo request = {0, 0};
  void* newHandle = NULL;

  *outRegBufFlag = 0;
  *outHandle = NULL;

  // Nothing to do without a communicator or for an empty buffer.
  if (!comm || !userbuff || buffSize == 0) goto exit;

  NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &cached), ret, fail);
  if (cached == NULL) goto exit;

  if (cached->state & COLLNET_REG_COMPLETE) {
    // reuse previous registration
    *outRegBufFlag = 2;
    *outHandle = cached->collnetHandle;
    goto exit;
  }

  /* start register collnet buffer */
  // Register the whole cached region (page count times page size), not just
  // the [userbuff, userbuff + buffSize) window.
  request.buffer = cached->addr;
  request.size = cached->pages * comm->regCache.pageSize;
  connector = (type == collNetRecv)
    ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn
    : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn;
  NCCLCHECKGOTO(ncclProxyCallBlocking(comm, connector, ncclProxyMsgRegister, &request, sizeof(struct collnetRegInfo), &newHandle, sizeof(void*)), ret, fail);

  // The proxy answers with a NULL handle when the plugin refused the region.
  if (newHandle != NULL) {
    cached->state |= COLLNET_REG_COMPLETE;
    cached->proxyconn = connector;
    cached->collnetHandle = newHandle;
    *outHandle = newHandle;
    *outRegBufFlag = 1;
  }

exit:
  return ret;
fail:
  *outRegBufFlag = 0;
  *outHandle = NULL;
  goto exit;
}
// Registers a user buffer with CollNet on behalf of a kernel plan and queues a
// record of the registration on plan->collnetHandleQueue.
//
// The region sent to the proxy is the buffer widened to page boundaries of
// comm->regCache.pageSize; the record keeps the caller's original pointer and
// size.
//
// Outputs:
//   *outRegBufFlag  1 when a registration handle was obtained, 0 otherwise.
//   *outHandle      the CollNet handle, or NULL when *outRegBufFlag is 0.
//
// Returns the error of the proxy call if it fails. A NULL handle returned by
// the proxy (the proxy-side handlers report a failed regMr() that way) is not
// an error: the function returns ncclSuccess with *outRegBufFlag == 0 and
// nothing is queued on the plan, matching ncclCollnetLocalRegisterBuffer.
ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) {
  ncclResult_t ret = ncclSuccess;
  void* handle = NULL;
  struct ncclRegCache* cache = &comm->regCache;
  uintptr_t pageSize = cache->pageSize;
  // Round the start down and the end up to a page boundary.
  // NOTE(review): the mask with -pageSize assumes pageSize is a power of two - confirm.
  uintptr_t addr = (uintptr_t)userbuff & -pageSize;
  size_t size = DIVUP((uintptr_t)userbuff - addr + buffSize, pageSize) * pageSize;
  collnetRegInfo info = {addr, size};
  struct ncclCollnetHandleList* record = NULL;
  struct ncclProxyConnector* proxyConn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn;

  *outRegBufFlag = 0;
  *outHandle = NULL;
  NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);

  // A NULL handle means the plugin did not register the region: do not report
  // the buffer as registered and do not queue a record that would later be
  // deregistered with a NULL handle.
  if (handle == NULL) goto exit;

  record = ncclMemoryPoolAlloc<struct ncclCollnetHandleList>(&comm->memPool_ncclCollnetHandleList, &comm->memPermanent);
  record->proxyconn = proxyConn;
  record->buffer = userbuff;
  record->size = buffSize;
  *outHandle = record->collnetHandle = handle;
  *outRegBufFlag = 1;
  ncclIntruQueueEnqueue(&plan->collnetHandleQueue, record);

exit:
  return ret;
fail:
  *outRegBufFlag = 0;
  *outHandle = NULL;
  goto exit;
}
// Releases a CollNet buffer registration by sending an ncclProxyMsgDeregister
// request through ncclProxyCallBlocking on `proxyconn`.
//   comm      - communicator the registration belongs to.
//   proxyconn - proxy connector the request is sent on.
//   handle    - registration handle to release.
// Returns ncclSuccess, or leaves early via NCCLCHECK if the proxy call fails.
ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle) {
// The request payload is the handle value itself (pointer-sized); no response
// buffer is supplied (NULL, 0).
NCCLCHECK(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0));
return ncclSuccess;
}
// Proxy-side handler for a registration request on a send connection.
//   reqBuff  - a struct collnetRegInfo describing the region to register.
//   respBuff - receives one pointer: the handle produced by the CollNet
//              plugin's regMr(), or NULL if regMr() did not return ncclSuccess.
//   done     - always set to 1.
// Always returns ncclSuccess; a failed registration is reported only through
// the NULL handle in the response.
static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  assert(reqSize == sizeof(struct collnetRegInfo));
  assert(respSize == sizeof(void*));

  struct sendResources* sendRes = (struct sendResources*)(connection->transportResources);
  struct collnetRegInfo* request = (struct collnetRegInfo*)reqBuff;

  void* mrHandle;
  ncclResult_t regStatus = proxyState->ncclCollNet->regMr(sendRes->collNetComm, (void*)request->buffer, request->size, NCCL_PTR_CUDA, &mrHandle);
  if (regStatus != ncclSuccess) {
    // Failure is signalled to the caller as a NULL handle.
    mrHandle = NULL;
  }

  memcpy(respBuff, (void*)&mrHandle, sizeof(void*));
  *done = 1;
  return ncclSuccess;
}
// Proxy-side handler for a registration request on a recv connection.
//   reqBuff  - a struct collnetRegInfo describing the region to register.
//   respBuff - receives one pointer: the handle produced by the CollNet
//              plugin's regMr(), or NULL if regMr() did not return ncclSuccess.
//   done     - always set to 1.
// Always returns ncclSuccess; a failed registration is reported only through
// the NULL handle in the response.
static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  assert(reqSize == sizeof(struct collnetRegInfo));
  assert(respSize == sizeof(void*));

  struct recvResources* recvRes = (struct recvResources*)(connection->transportResources);
  struct collnetRegInfo* request = (struct collnetRegInfo*)reqBuff;

  void* mrHandle;
  ncclResult_t regStatus = proxyState->ncclCollNet->regMr(recvRes->collNetComm, (void*)request->buffer, request->size, NCCL_PTR_CUDA, &mrHandle);
  if (regStatus != ncclSuccess) {
    // Failure is signalled to the caller as a NULL handle.
    mrHandle = NULL;
  }

  memcpy(respBuff, (void*)&mrHandle, sizeof(void*));
  *done = 1;
  return ncclSuccess;
}
// Proxy-side handler for a deregistration request on a send connection.
//   reqBuff - holds exactly one pointer: the handle to release.
//   done    - set to 1 once the CollNet plugin's deregMr() has succeeded.
// Returns ncclSuccess, or leaves early via NCCLCHECK (with *done untouched)
// when deregMr() fails.
static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
  assert(reqSize == sizeof(void*));

  struct sendResources* sendRes = (struct sendResources*)(connection->transportResources);

  // Copy the handle out of the request bytes.
  void* mrHandle;
  memcpy(&mrHandle, reqBuff, sizeof(void*));

  NCCLCHECK(proxyState->ncclCollNet->deregMr(sendRes->collNetComm, mrHandle));
  *done = 1;
  return ncclSuccess;
}
// Proxy-side handler for a deregistration request on a recv connection.
//   reqBuff - holds exactly one pointer: the handle to release.
//   done    - set to 1 once the CollNet plugin's deregMr() has succeeded.
// Returns ncclSuccess, or leaves early via NCCLCHECK (with *done untouched)
// when deregMr() fails.
static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
  assert(reqSize == sizeof(void*));

  struct recvResources* recvRes = (struct recvResources*)(connection->transportResources);

  // Copy the handle out of the request bytes.
  void* mrHandle;
  memcpy(&mrHandle, reqBuff, sizeof(void*));

  NCCLCHECK(proxyState->ncclCollNet->deregMr(recvRes->collNetComm, mrHandle));
  *done = 1;
  return ncclSuccess;
}
struct ncclTransport collNetTransport = {
"COL",
canConnect,
{ sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
{ recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
{ sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer },
{ recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer }
};
+17 -13
Parādīt failu
@@ -179,8 +179,9 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
req.connIndex = connIndex;
int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
int64_t netId;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
tpProxyRank = comm->topParentRanks[proxyRank];
@@ -216,8 +217,9 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
// Use myInfo->rank as the receiver uses its own NIC
int proxyRank, tpProxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
int64_t netId;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
// Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
@@ -347,6 +349,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
send->conn.tail = &recvMem->tail;
send->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
send->conn.connFifo = recvMem->connFifo;
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
for (int i=0; i<NCCL_STEPS; i++) {
@@ -412,6 +415,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
recv->conn.connFifo = recvMem->connFifo;
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
for (int i=0; i<NCCL_STEPS; i++) {
@@ -1035,7 +1039,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
sub->posted = sub->transmitted = sub->done = 0;
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
if (sub->reg && sub->nbytes > 0) {
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
} else {
sub->mhandle = resources->mhandles[args->protocol];
}
@@ -1110,7 +1114,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
}
} else if (p == NCCL_PROTO_SIMPLE && resources->shared) {
buff = sub->reg ? (char*)sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset;
buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset;
}
if (ready) {
// Data is ready, try to send.
@@ -1134,7 +1138,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
if (done) {
if (sub->reg) {
if (size < sub->nbytes) {
sub->buffer = ((char*)sub->buffer)+size;
sub->recvbuff += size;
sub->nbytes -= size;
// Do one more step (at least)
sub->nsteps++;
@@ -1215,7 +1219,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
if (sub->reg && sub->nbytes > 0) {
// Register buffer
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
} else {
sub->mhandle = resources->mhandles[args->protocol];
}
@@ -1247,7 +1251,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
if (sub->reg) {
// Wait until CUDA kernel has started before we access the user buffer directly.
if (connFifo[sub->base%NCCL_STEPS].size == -1) continue;
ptrs[subCount] = sub->buffer;
ptrs[subCount] = sub->recvbuff;
sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes);
} else {
int sharedBuffSlot = sub->posted%maxDepth;
@@ -1307,7 +1311,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
int size = sizes[subIndex++];
if (sub->reg) {
if (size < sub->nbytes) {
sub->buffer = ((char*)sub->buffer) + size;
sub->recvbuff += size;
sub->nbytes -= size;
// Do one more step (at least)
sub->nsteps++;
@@ -1349,7 +1353,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS;
ptrs[subCount] = resources->shared ?
(sub->reg ? sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset) :
(sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset) :
localBuff+buffSlot*stepSize;
mhandles[subCount] = sub->mhandle;
subCount++;
@@ -1439,6 +1443,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
struct ncclTransport netTransport = {
"NET",
canConnect,
{ sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
{ recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
{ sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, NULL },
{ recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, NULL }
};
+233 -20
Parādīt failu
@@ -77,7 +77,8 @@ struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
static int ncclIbRelaxedOrderingEnabled = 0;
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1);
NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2);
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18);
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
NCCL_PARAM(IbPkey, "IB_PKEY", 0);
@@ -103,6 +104,210 @@ static void* ncclIbAsyncThreadMain(void* args) {
return NULL;
}
// Reads NCCL_IB_ADDR_FAMILY and maps it to an address family.
// Returns AF_INET6 only for the exact value "AF_INET6"; an unset, empty,
// "AF_INET" or unrecognized value yields AF_INET. The value is logged whenever
// the variable is set to a non-empty string.
static sa_family_t envIbAddrFamily(void) {
  const char* value = ncclGetEnv("NCCL_IB_ADDR_FAMILY");
  if (value == NULL || value[0] == '\0') {
    return AF_INET;
  }

  INFO(NCCL_ENV, "NCCL_IB_ADDR_FAMILY set by environment to %s", value);

  // Anything other than "AF_INET6" resolves to the IPv4 default.
  const bool wantsV6 = (strcmp(value, "AF_INET6") == 0);
  return wantsV6 ? AF_INET6 : AF_INET;
}
// Parses NCCL_IB_ADDR_RANGE, expected in the form "<address>/<prefix length>".
//   af   - address family used to parse the address part (AF_INET or AF_INET6).
//   mask - receives the prefix length; 0 whenever NULL is returned.
// Returns a pointer to a function-static struct in_addr (AF_INET) or struct
// in6_addr (otherwise) holding the parsed address, or NULL when the variable
// is unset or empty, has no '/' separator, the address does not parse for
// `af`, or the prefix length is negative or larger than the family allows
// (32 / 128). Because the storage is static, the result is overwritten by the
// next call.
static void* envIbAddrRange(sa_family_t af, int* mask) {
  *mask = 0;
  static struct in_addr addr;
  static struct in6_addr addr6;
  void *ret = (af == AF_INET) ? (void *)&addr : (void *)&addr6;

  const char* env = ncclGetEnv("NCCL_IB_ADDR_RANGE");
  if (NULL == env || strlen(env) == 0) {
    return NULL;
  }

  INFO(NCCL_ENV, "NCCL_IB_ADDR_RANGE set by environment to %s", env);

  // Work on a private, NUL-terminated copy so it can be split in place.
  char addrString[128] = { 0 };
  snprintf(addrString, 128, "%s", env);
  char *addrStrPtr = addrString;

  // Test the search result itself: adding 1 before the NULL check would turn
  // "not found" into a non-NULL pointer and the split below would then write
  // through a null pointer.
  char *slash = strchr(addrString, '/');
  if (NULL == slash) {
    return NULL;
  }
  *slash = '\0';
  char *maskStrPtr = slash + 1;

  // inet_pton() returns 1 only on success; 0 is an unparsable address and -1
  // an unsupported address family.
  if (inet_pton(af, addrStrPtr, ret) != 1) {
    WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6");
    return NULL;
  }

  *mask = (int)strtol(maskStrPtr, NULL, 10);
  bool maskOutOfRange = (*mask < 0) || (af == AF_INET && *mask > 32) || (af == AF_INET6 && *mask > 128);
  if (maskOutOfRange) {
    WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6");
    *mask = 0;
    ret = NULL;
  }

  return ret;
}
// Classifies a GID by the layout of its 128-bit value.
// Returns AF_INET when the GID is an IPv4-mapped address (first 64 bits zero,
// third 32-bit word 0x0000ffff) or its multicast counterpart (first word
// 0xff0e0000, second word zero, third word 0x0000ffff); AF_INET6 otherwise.
static sa_family_t getGidAddrFamily(union ibv_gid* gid) {
  const struct in6_addr *a = (struct in6_addr *)gid->raw;
  const uint32_t v4Marker = htonl(0x0000ffff);

  // Both recognized layouts share a zero second word and the ::ffff marker
  // in the third word.
  if (a->s6_addr32[1] != 0 || a->s6_addr32[2] != v4Marker) {
    return AF_INET6;
  }

  const bool isIpV4Mapped = (a->s6_addr32[0] == 0);
  const bool isIpV4MappedMulticast = (a->s6_addr32[0] == htonl(0xff0e0000));
  if (isIpV4Mapped || isIpV4MappedMulticast) {
    return AF_INET;
  }
  return AF_INET6;
}
// Tests whether a GID falls inside an address prefix.
//   af        - AF_INET compares the prefix against the last 32-bit word of
//               the GID; any other value compares it word by word against the
//               whole 128-bit GID.
//   prefix    - struct in_addr* (AF_INET) or struct in6_addr* (otherwise); it
//               is not dereferenced when prefixlen <= 0.
//   prefixlen - number of leading bits that must match.
//   gid       - GID to test.
// Returns true when the first `prefixlen` bits match; a prefixlen <= 0 matches
// every GID. An AF_INET prefix longer than 32 bits never matches.
static bool matchGidAddrPrefix(sa_family_t af, void* prefix, int prefixlen, union ibv_gid* gid) {
  struct in_addr *base = NULL;
  struct in6_addr *base6 = NULL;
  struct in6_addr *addr6 = NULL;
  if (af == AF_INET) {
    base = (struct in_addr *)prefix;
  } else {
    base6 = (struct in6_addr *)prefix;
  }
  addr6 = (struct in6_addr *)gid->raw;

  // Network-order mask with the top `bits` bits set, valid for 1 <= bits <= 32.
  // Unsigned arithmetic keeps the 1-bit case (shift by 31) free of signed
  // overflow.
#define NETMASK(bits) (htonl(0xffffffffU ^ ((1U << (32 - (bits))) - 1U)))

  int i = 0;
  while (prefixlen > 0 && i < 4) {
    if (af == AF_INET) {
      // A prefix wider than an IPv4 address cannot match and would make the
      // mask shift count negative.
      if (prefixlen > 32) {
        break;
      }
      uint32_t mask = NETMASK(prefixlen);
      if ((base->s_addr & mask) ^ (addr6->s6_addr32[3] & mask)) {
        break;
      }
      prefixlen = 0;
      break;
    } else {
      if (prefixlen >= 32) {
        // Whole word covered by the prefix: it must match exactly.
        if (base6->s6_addr32[i] ^ addr6->s6_addr32[i]) {
          break;
        }
        prefixlen -= 32;
        ++i;
      } else {
        // Last, partially covered word.
        uint32_t mask = NETMASK(prefixlen);
        if ((base6->s6_addr32[i] & mask) ^ (addr6->s6_addr32[i] & mask)) {
          break;
        }
        prefixlen = 0;
      }
    }
  }

  // All requested bits were consumed without a mismatch.
  return (prefixlen == 0) ? true : false;
}
// Tells whether a GID table entry carries an actual address.
// Returns false for the all-zero GID and for the bare fe80:: value (first
// 32-bit word 0xfe800000, remaining 96 bits zero); true for anything else.
static bool configuredGid(union ibv_gid* gid) {
  const struct in6_addr *a = (struct in6_addr *)gid->raw;

  // Both rejected patterns have the last 96 bits cleared.
  const bool tailIsZero = (a->s6_addr32[1] == 0) && (a->s6_addr32[2] == 0) && (a->s6_addr32[3] == 0);
  if (!tailIsZero) {
    return true;
  }

  const bool isAllZero = (a->s6_addr32[0] == 0);
  const bool isBareLinkLocalPrefix = (a->s6_addr32[0] == htonl(0xfe800000));
  return !(isAllZero || isBareLinkLocalPrefix);
}
// Report whether a GID starts with the 64-bit prefix fe80:0000:0000:0000,
// i.e. its first word is 0xfe800000 (network order) and its second is zero.
static bool linkLocalGid(union ibv_gid* gid) {
  const struct in6_addr *addr = (struct in6_addr *)gid->raw;
  return (addr->s6_addr32[0] == htonl(0xfe800000)) && (addr->s6_addr32[1] == 0UL);
}
// A GID is usable when it is configured and is not a link-local address.
static bool validGid(union ibv_gid* gid) {
  if (!configuredGid(gid)) {
    return false;
  }
  return !linkLocalGid(gid);
}
// Read the RoCE version of one GID table entry from sysfs.
//
//   deviceName - IB device name used to build the sysfs path.
//   portNum    - port number on that device.
//   gidIndex   - index of the GID entry to inspect.
//   version    - out: set to 1 when the type string starts with "IB/RoCE v1"
//                or "RoCE v1", to 2 when it starts with "RoCE v2". It is left
//                untouched when the file is empty or holds any other string,
//                so callers should initialise it before the call.
//
// Returns ncclSystemError when the path cannot be built, opened or read, and
// ncclSuccess otherwise (including the "unknown type" case).
static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, int gidIndex, int* version) {
  char gidRoceVerStr[16] = { 0 };
  char roceTypePath[PATH_MAX] = { 0 };
  // Bounded formatting: a truncated path is reported instead of overflowing.
  int pathLen = snprintf(roceTypePath, sizeof(roceTypePath), "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", deviceName, portNum, gidIndex);
  if (pathLen < 0 || (size_t)pathLen >= sizeof(roceTypePath)) {
    return ncclSystemError;
  }

  int fd = open(roceTypePath, O_RDONLY);
  if (fd == -1) {
    return ncclSystemError;
  }
  // Read at most 15 bytes so the zero-initialised buffer stays NUL-terminated.
  ssize_t nRead = read(fd, gidRoceVerStr, sizeof(gidRoceVerStr) - 1);
  close(fd);
  if (nRead == -1) {
    return ncclSystemError;
  }

  if (strlen(gidRoceVerStr)) {
    if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0 || strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
      *version = 1;
    } else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
      *version = 2;
    }
  }
  return ncclSuccess;
}
// Decide whether GID table entry `gidIndexCandidate` should replace the entry
// currently selected in `*gidIndex`.
//
//   context, portNum  - device context and port whose GID table is queried.
//   af                - address family requested by the user.
//   prefix, prefixlen - subnet the GID address must belong to.
//   roceVer           - RoCE version requested by the user.
//   gidIndexCandidate - entry being evaluated.
//   gidIndex          - in/out: current selection, overwritten with the
//                       candidate when the candidate is preferred.
//
// Returns ncclSuccess whether or not the selection changed; errors from the
// GID / sysfs queries are propagated through NCCLCHECK.
static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t portNum, sa_family_t af, void* prefix, int prefixlen, int roceVer, int gidIndexCandidate, int* gidIndex) {
  union ibv_gid gid, gidCandidate;
  NCCLCHECK(wrap_ibv_query_gid(context, portNum, *gidIndex, &gid));
  NCCLCHECK(wrap_ibv_query_gid(context, portNum, gidIndexCandidate, &gidCandidate));

  sa_family_t usrFam = af;
  sa_family_t gidFam = getGidAddrFamily(&gid);
  sa_family_t gidCandidateFam = getGidAddrFamily(&gidCandidate);
  bool gidCandidateMatchSubnet = matchGidAddrPrefix(usrFam, prefix, prefixlen, &gidCandidate);

  if (gidCandidateFam != gidFam && gidCandidateFam == usrFam && gidCandidateMatchSubnet) {
    // The candidate has the requested family while the current entry has a
    // different one: take the candidate.
    *gidIndex = gidIndexCandidate;
  } else {
    if (gidCandidateFam != usrFam || !validGid(&gidCandidate) || !gidCandidateMatchSubnet) {
      return ncclSuccess;
    }
    int usrRoceVer = roceVer;
    // Start from 0 ("unknown"): ncclIbRoceGetVersionNum does not write its
    // output when the sysfs type string is empty or unrecognised, and the
    // comparison below must not read indeterminate values.
    int gidRoceVerNum = 0;
    int gidRoceVerNumCandidate = 0;
    const char* deviceName = wrap_ibv_get_device_name(context->device);
    NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum));
    NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate));
    // Switch when the candidate has the requested RoCE version and either the
    // versions differ or the current entry is not a valid GID.
    if ((gidRoceVerNum != gidRoceVerNumCandidate || !validGid(&gid)) && gidRoceVerNumCandidate == usrRoceVer) {
      *gidIndex = gidIndexCandidate;
    }
  }
  return ncclSuccess;
}
// Pick the GID table index to use on a port.
//
// A non-negative value from ncclParamIbGidIndex() is used as is. Otherwise the
// search starts at index 0 and every entry from 1 to gidTblLen-1 is offered to
// ncclUpdateGidIndex, which may replace the current choice according to the
// user's address family, address range and RoCE version settings.
//
//   context, portNum - device context and port to inspect.
//   gidTblLen        - number of entries in the port's GID table.
//   gidIndex         - out: selected index.
static ncclResult_t ncclIbGetGidIndex(struct ibv_context *context, uint8_t portNum, int gidTblLen, int *gidIndex) {
  *gidIndex = ncclParamIbGidIndex();
  if (*gidIndex < 0) {
    // No explicit index configured: gather the user's preferences once.
    sa_family_t wantedFamily = envIbAddrFamily();
    int wantedRoceVersion = ncclParamIbRoceVersionNum();
    int rangeBits;
    void *rangeBase = envIbAddrRange(wantedFamily, &rangeBits);

    *gidIndex = 0;
    int candidate = 1;
    while (candidate < gidTblLen) {
      NCCLCHECK(ncclUpdateGidIndex(context, portNum, wantedFamily, rangeBase, rangeBits, wantedRoceVersion, candidate, gidIndex));
      ++candidate;
    }
  }
  return ncclSuccess;
}
// Runtime parameters for the IB transport, declared as (name, key, default).
// NOTE(review): each declaration presumably provides an ncclParam<Name>()
// accessor; confirm against the NCCL_PARAM macro definition.
// IbDisable: ncclIbInit() returns ncclInternalError right away when
// ncclParamIbDisable() is non-zero.
NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
// IbMergeVfs / IbMergeNics: default to 1; their consumers are not visible in
// this part of the file.
NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1);
NCCL_PARAM(IbMergeNics, "IB_MERGE_NICS", 1);
@@ -182,6 +387,7 @@ int ncclIbFindMatchingDev(int dev) {
}
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclResult_t ret;
if (ncclParamIbDisable()) return ncclInternalError;
static int shownIbHcaEnv = 0;
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
@@ -194,7 +400,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclNMergedIbDevs = 0;
if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
WARN("NET/IB : No IP interface found.");
return ncclInternalError;
ret = ncclInternalError;
goto fail;
}
// Detect IB cards
@@ -211,7 +418,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
if (searchExact) userIbEnv++;
int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; }
for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) {
struct ibv_context * context;
@@ -224,7 +431,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
memset(&devAttr, 0, sizeof(devAttr));
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
WARN("NET/IB : Unable to query device %s", devices[d]->name);
if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
if (ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
continue;
}
for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) {
@@ -244,6 +451,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
ncclIbDevs[ncclNIbDevs].device = d;
ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
ncclIbDevs[ncclNIbDevs].portAttr = portAttr;
ncclIbDevs[ncclNIbDevs].portNum = port_num;
ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
@@ -295,9 +503,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclNIbDevs++;
nPorts++;
}
if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
}
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; };
}
if (ncclNIbDevs == 0) {
INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
@@ -333,6 +541,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
pthread_mutex_unlock(&ncclIbLock);
}
return ncclSuccess;
fail:
pthread_mutex_unlock(&ncclIbLock);
return ret;
}
ncclResult_t ncclIbDevices(int* ndev) {
@@ -484,6 +695,7 @@ struct ncclIbHandle {
// Local GID state kept for each device of a communicator.
struct ncclIbGidInfo {
// Link layer of the port, copied from the port attributes at connect time.
uint8_t link_layer;
// GID read from the port's GID table at localGidIndex.
union ibv_gid localGid;
// Index into the port's GID table chosen by ncclIbGetGidIndex(); also passed
// to ncclIbRtrQp() as the source GID index.
int32_t localGidIndex;
};
#define NCCL_NET_IB_REQ_UNUSED 0
@@ -516,7 +728,7 @@ struct ncclIbNetCommDevBase {
int ibDevN;
struct ibv_pd* pd;
struct ibv_cq* cq;
uint64_t pad[1];
uint64_t pad[2];
struct ncclIbGidInfo gidInfo;
};
@@ -698,7 +910,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
return ncclSuccess;
}
ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t dest_qp_num, struct ncclIbDevInfo* info) {
ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint8_t sGidIndex, uint32_t dest_qp_num, struct ncclIbDevInfo* info) {
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_RTR;
@@ -712,7 +924,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t dest_qp_num, struct ncclIbD
qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn;
qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid;
qpAttr.ah_attr.grh.flow_label = 0;
qpAttr.ah_attr.grh.sgid_index = ncclParamIbGidIndex();
qpAttr.ah_attr.grh.sgid_index = sGidIndex;
qpAttr.ah_attr.grh.hop_limit = 255;
qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc();
} else {
@@ -818,9 +1030,6 @@ ib_connect_check:
for (int i = 0; i < comm->base.ndevs; i++) {
ncclIbSendCommDev* commDev = comm->devs + i;
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
// Send my QP Info to receiver through the socket. Hope this won't block.
// TODO - I thought I queried this in init?
NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr));
// Write to the metadata struct via this pointer
ncclIbDevInfo* devInfo = meta.devs + i;
@@ -835,7 +1044,8 @@ ib_connect_check:
// RoCE support
devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
if (devInfo->link_layer == IBV_LINK_LAYER_ETHERNET) {
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &commDev->base.gidInfo.localGid));
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, ibDev->portAttr.gid_tbl_len, &commDev->base.gidInfo.localGidIndex));
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid));
devInfo->spn = commDev->base.gidInfo.localGid.global.subnet_prefix;
devInfo->iid = commDev->base.gidInfo.localGid.global.interface_id;
}
@@ -854,7 +1064,7 @@ ib_connect_check:
if (comm->base.qps[q].devIndex == i)
INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x",
comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, ncclParamIbGidIndex(),
commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex,
devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey);
}
}
@@ -923,12 +1133,15 @@ ib_connect:
// Assign per-QP remDev
comm->base.qps[q].remDevIdx = remQpInfo->devIndex;
int devIndex = comm->base.qps[q].devIndex;
ncclIbSendCommDev* commDev = comm->devs + devIndex;
uint8_t gidIndex = commDev->base.gidInfo.localGidIndex;
struct ibv_qp* qp = comm->base.qps[q].qp;
if (remQpInfo->ece_supported && remQpInfo->ece_supported)
NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported));
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo->qpn, remDevInfo));
NCCLCHECK(ncclIbRtrQp(qp, gidIndex, remQpInfo->qpn, remDevInfo));
NCCLCHECK(ncclIbRtsQp(qp));
}
@@ -1024,8 +1237,8 @@ ib_recv:
ibDevN = mergedDev->devs[i];
NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base));
ibDev = ncclIbDevs + ibDevN;
NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr));
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &rCommDev->base.gidInfo.localGid));
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, ibDev->portAttr.gid_tbl_len, &rCommDev->base.gidInfo.localGidIndex));
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid));
}
// Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
@@ -1064,7 +1277,7 @@ ib_recv:
NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
}
NCCLCHECK(ncclIbRtrQp(qp->qp, remMeta.qpInfo[q].qpn, remDevInfo));
NCCLCHECK(ncclIbRtrQp(qp->qp, rCommDev->base.gidInfo.localGidIndex, remMeta.qpInfo[q].qpn, remDevInfo));
NCCLCHECK(ncclIbRtsQp(qp->qp));
}
@@ -1097,7 +1310,7 @@ ib_recv:
devInfo.spn = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
devInfo.iid = rCommDev->base.gidInfo.localGid.global.interface_id;
devInfo.mtu = ibDev->portAttr.active_mtu;
NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo));
NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, rCommDev->base.gidInfo.localGidIndex, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo));
NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp));
}
@@ -1724,7 +1937,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
return ncclInternalError;
}
if (req->nreqs == 1) {
req->recv.sizes[0] += wc->imm_data;
req->recv.sizes[0] = wc->imm_data;
}
}
req->events[i]--;
+31 -16
Parādīt failu
@@ -46,12 +46,12 @@ struct ncclTransport nvlsTransport = {
{ NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL }
};
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, int nranks, size_t size) {
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, size_t size) {
CUmulticastObjectProp* prop = &resources->properties;
memset(prop, 0, sizeof(*prop));
prop->size = size;
prop->numDevices = nranks;
prop->handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
prop->numDevices = comm->MNNVL ? comm->clique.size : comm->localRanks;
prop->handleTypes = ncclCuMemHandleType;
prop->flags = 0;
// Could be changed to CU_MULTICAST_GRANULARITY_MINIMUM when 3418538 resolved
@@ -70,6 +70,7 @@ ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes*
}
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) {
CUmemAllocationHandleType type = ncclCuMemHandleType;
size_t size = prop->size;
// Create a Multicast group
@@ -77,9 +78,9 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop,
INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank);
CUCHECK(cuMulticastCreate(mcHandle, prop));
if ((NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) && (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)) {
if (type == CU_MEM_HANDLE_TYPE_FABRIC) {
// Get a handle to pass to other ranks
CUCHECK(cuMemExportToShareableHandle(shareableHandle, *mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0));
CUCHECK(cuMemExportToShareableHandle(shareableHandle, *mcHandle, ncclCuMemHandleType, 0));
}
else {
memcpy(shareableHandle, mcHandle, sizeof(CUmemGenericAllocationHandle));
@@ -97,7 +98,7 @@ ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes*
}
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) {
CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
CUmemAllocationHandleType type = ncclCuMemHandleType;
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
@@ -113,7 +114,7 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
(void) close(fd);
} else {
if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
if (type == CU_MEM_HANDLE_TYPE_FABRIC) {
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type));
} else {
memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle));
@@ -136,7 +137,7 @@ ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* r
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = resources->dev;
prop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
prop.requestedHandleTypes = ncclCuMemHandleType;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
resources->ucGran = granularity;
@@ -229,6 +230,7 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes*
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2);
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024);
ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
comm->nvlsSupport = 0;
@@ -236,8 +238,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
int gpuCount;
NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
// NVLS is not supported on MNNVL yet
if (!ncclParamNvlsEnable() || gpuCount <= 2 || comm->nNodes > 1 || comm->MNNVL) return ncclSuccess;
if (!ncclParamNvlsEnable() || ((!comm->MNNVL && gpuCount <= 2) || (comm->MNNVL && comm->clique.size <= 2))) return ncclSuccess;
CUdevice dev;
int driverVersion;
@@ -306,7 +307,8 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
NCCLCHECK(initNvlsChannel(comm, c, parent, false));
}
size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
int nvlsStepSize = comm->nvlsChunkSize = ncclParamNvlsChunkSize();
size_t buffSize = nvlsStepSize * NCCL_STEPS;
size_t memSize = NVLS_MEM_ALIGN_SIZE;
size_t nvlsPerRankSize = nChannels * 2 * (buffSize + memSize);
size_t nvlsTotalSize = nvlsPerRankSize * nHeads;
@@ -315,7 +317,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
comm, headRank, nHeads, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
char* shareableHandle = resources->shareableHandle;
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, comm->localRanks, nvlsTotalSize), res, cleanup);
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nvlsTotalSize), res, cleanup);
if (comm->localRank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, &resources->properties, comm->localRank, comm->localRanks, &resources->mcHandle, shareableHandle), res, cleanup);
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
@@ -326,8 +328,14 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
// Local intra-node barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
if (comm->localRanks > 1) {
// Local intra-node barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
}
if (comm->MNNVL) {
// MNNVL: Clique wide barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, comm->clique.ranks[0]), res, cleanup);
}
NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
for (int h = 0; h < nHeads; h++) {
@@ -343,11 +351,13 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[1].conn.head = (uint64_t*)(mem + buffSize);
peer->send[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
peer->send[1].conn.stepSize = nvlsStepSize;
mem = resources->mcBuff + (h * 2 * nChannels + c) * (buffSize + memSize);
peer->recv[0].transportComm = &nvlsTransport.recv;
peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[0].conn.head = (uint64_t*)(mem + buffSize);
peer->recv[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
peer->recv[0].conn.stepSize = nvlsStepSize;
peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL;
// Broadcast MC -> UC
@@ -356,11 +366,13 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[1].conn.head = (uint64_t*)(mem + buffSize);
peer->recv[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
peer->recv[1].conn.stepSize = nvlsStepSize;
mem = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize);
peer->send[0].transportComm = &nvlsTransport.send;
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[0].conn.head = (uint64_t*)(mem + buffSize);
peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
peer->send[0].conn.stepSize = nvlsStepSize;
peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
@@ -378,6 +390,9 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
}
}
// MNNVL does not support NVLS buffer registration
if (comm->MNNVL) return res;
/* create shared memory for fast NVLS buffer registration */
typeSize = sizeof(struct localRegData) << 1;
@@ -595,7 +610,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) {
localRegBufUsed = true;
INFO(NCCL_NVLS, "rank %d reuse local-registered sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
INFO(NCCL_NVLS, "rank %d reuse local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
goto exit;
}
@@ -611,7 +626,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
if (localRegBufUsed == false) goto fail;
}
INFO(NCCL_NVLS, "rank %d successfully local-registered sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
INFO(NCCL_NVLS, "rank %d successfully local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
exit:
*outRegBufSend = (void*)regSendPtr;
+8 -3
Parādīt failu
@@ -99,12 +99,15 @@ NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0);
static int useMemcpy = 0;
static void initCeOperation();
extern int64_t ncclParamMNNVLEnable();
/* Determine if two peers can communicate through p2p */
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
initCeOperation();
// MNNVL support
if (info1->hostHash != info2->hostHash) {
if (ncclParamMNNVLEnable() != 0 && info1->hostHash != info2->hostHash) {
NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret));
if (*ret) return ncclSuccess;
}
@@ -467,6 +470,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
buff += comm->buffSizes[p];
}
}
send->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
if (useMemcpy) {
send->conn.tail = &resources->proxyInfo.ceRecvMem->tail;
@@ -512,6 +516,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
recv->conn.ptrExchange = &remDevMem->ptrExchange;
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
}
recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
char* buff = (char*)(resources->recvDevMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -749,8 +754,8 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
struct ncclTransport p2pTransport = {
"P2P",
p2pCanConnect,
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL },
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL }
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, NULL },
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, NULL }
};
static void initCeOperation() {
+8 -2
Parādīt failu
@@ -150,6 +150,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
}
send->conn.tail = &resources->devRemHostMem->tail;
send->conn.head = &resources->devHostMem->head;
send->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
if (useMemcpyRecv) {
send->conn.connFifo = resources->devRemHostMem->connFifo;
@@ -189,6 +190,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
}
recv->conn.head = &resources->devRemHostMem->head;
recv->conn.tail = &resources->devHostMem->tail;
recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
if (useMemcpyRecv) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
@@ -210,6 +212,7 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) {
NCCLCHECK(ncclShmClose(resources->hostHandle));
NCCLCHECK(ncclShmClose(resources->remHandle));
free(resources);
send->transportResources = NULL;
}
return ncclSuccess;
}
@@ -220,6 +223,7 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
NCCLCHECK(ncclShmClose(resources->hostHandle));
NCCLCHECK(ncclShmClose(resources->remHandle));
free(resources);
recv->transportResources = NULL;
}
return ncclSuccess;
}
@@ -271,6 +275,7 @@ static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, str
CUDACHECK(cudaEventDestroy(resources->events[i]));
}
free(connection->transportResources);
connection->transportResources = NULL;
}
return ncclSuccess;
}
@@ -286,6 +291,7 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str
CUDACHECK(cudaEventDestroy(resources->events[i]));
}
free(connection->transportResources);
connection->transportResources = NULL;
}
return ncclSuccess;
}
@@ -409,8 +415,8 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL },
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL }
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL, NULL },
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL, NULL }
};
static void initCeOperation() {