2020-02-03 22:06:44 +00:00
/*************************************************************************
2022-09-09 01:20:52 +00:00
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
2023-02-04 01:43:38 +00:00
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
2020-02-03 22:06:44 +00:00
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "channel.h"
#include "nvmlwrap.h"
#include "bootstrap.h"
#include "transport.h"
#include "group.h"
#include "net.h"
#include "graph.h"
#include "argcheck.h"
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <hip/hip_runtime.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <dlfcn.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
2023-06-21 20:54:24 -07:00
#include <cstdarg>
2020-03-03 11:42:40 -08:00
#include "xml.h"
#include "coll_net.h"
2020-02-03 22:06:44 +00:00
#include "model.h"
#include "utils.h"
2021-07-27 08:30:08 -07:00
#include "rocm_smi/rocm_smi.h"
2020-02-03 22:06:44 +00:00
2022-02-21 13:09:47 +08:00
const char * ncclFuncStr [ NCCL_NUM_FUNCTIONS + 2 ] = { "Broadcast" , "Reduce" , "AllGather" , "ReduceScatter" , "AllReduce" , "SendRecv" , "AllToAllPivot" };
2022-11-04 22:54:29 +00:00
const char * ncclAlgoStr [ NCCL_NUM_ALGORITHMS ] = { "Tree" , "Ring" , "CollNetDirect" , "CollNetChain" };
2020-04-02 18:01:21 -07:00
const char * ncclProtoStr [ NCCL_NUM_PROTOCOLS ] = { "LL" , "LL128" , "Simple" };
2020-02-03 22:06:44 +00:00
extern NodeModel * node_model ;
2023-06-21 20:54:24 -07:00
RCCL_PARAM ( CliqueIgnoreTopo , "CLIQUE_IGNORE_TOPO" , 0 );
RCCL_PARAM ( P2pNetDisable , "P2P_NET_DISABLE" , 0 );
RCCL_PARAM ( PivotAlltoallEnable , "PIVOT_ALLTOALL_ENABLE" , 1 );
RCCL_PARAM ( LL128ForceEnable , "LL128_FORCE_ENABLE" , 0 );
2020-03-03 11:42:40 -08:00
NCCL_PARAM ( GraphDumpFileRank , "GRAPH_DUMP_FILE_RANK" , 0 );
2023-06-21 20:54:24 -07:00
NCCL_PARAM ( CollNetNodeThreshold , "COLLNET_NODE_THRESHOLD" , 2 );
NCCL_PARAM ( NvbPreconnect , "NVB_PRECONNECT" , 0 );
NCCL_PARAM ( AllocP2pNetLLBuffers , "ALLOC_P2P_NET_LL_BUFFERS" , 0 );
2020-03-03 11:42:40 -08:00
thread_local int ncclDebugNoWarn = 0 ;
2021-04-30 16:57:36 -07:00
ncclCollNet_t * ncclCollNet = NULL ;
2020-02-03 22:06:44 +00:00
// Get current Compute Capability
int ncclCudaCompCap () {
int ccMajor = 1 , ccMinor = 0 ;
return ccMajor * 10 + ccMinor ;
}
ncclResult_t int64ToBusId ( int64_t id , char * busId ) {
sprintf ( busId , "%04lx:%02lx:%02lx.%01lx" , ( id ) >> 20 , ( id & 0xff000 ) >> 12 , ( id & 0xff0 ) >> 4 , ( id & 0xf ));
return ncclSuccess ;
}
2020-03-03 11:42:40 -08:00
ncclResult_t busIdToInt64 ( const char * busId , int64_t * id ) {
2020-02-03 22:06:44 +00:00
const int size = strlen ( busId );
char * hexStr ;
NCCLCHECK ( ncclCalloc ( & hexStr , size ));
int hexOffset = 0 ;
for ( int i = 0 ; i < size ; i ++ ) {
char c = busId [ i ];
if ( c == '.' || c == ':' ) continue ;
if (( c >= '0' && c <= '9' ) ||
( c >= 'A' && c <= 'F' ) ||
( c >= 'a' && c <= 'f' )) {
hexStr [ hexOffset ++ ] = busId [ i ];
} else break ;
}
hexStr [ hexOffset ] = '\0' ;
* id = strtol ( hexStr , NULL , 16 );
free ( hexStr );
return ncclSuccess ;
}
2023-06-21 20:54:24 -07:00
void * ncclMemoryStack :: allocateSpilled ( struct ncclMemoryStack * me , size_t size , size_t align ) {
// `me->hunks` points to the top of the stack non-empty hunks. Hunks above
// this (reachable via `->above`) are empty.
struct Hunk * top = me -> topFrame . hunk ;
size_t mallocSize = 0 ;
// If we have lots of space left in hunk but that wasn't enough then we'll
// allocate the object unhunked.
if ( me -> topFrame . end - me -> topFrame . bumper >= 8 << 10 )
goto unhunked ;
// If we have another hunk (which must be empty) waiting above this one and
// the object fits then use that.
if ( top && top -> above ) {
struct Hunk * top1 = top -> above ;
uintptr_t uobj = ( reinterpret_cast < uintptr_t > ( top1 ) + sizeof ( struct Hunk ) + align - 1 ) & - uintptr_t ( align );
if ( uobj + size <= reinterpret_cast < uintptr_t > ( top1 ) + top1 -> size ) {
me -> topFrame . hunk = top1 ;
me -> topFrame . bumper = uobj + size ;
me -> topFrame . end = reinterpret_cast < uintptr_t > ( top1 ) + top1 -> size ;
return reinterpret_cast < void *> ( uobj );
}
}
{ // If the next hunk we're going to allocate wouldn't be big enough but the
// Unhunk proxy fits in the current hunk then go allocate as unhunked.
size_t nextSize = ( top ? top -> size : 0 ) + ( 64 << 10 );
constexpr size_t maxAlign = 64 ;
if ( nextSize < sizeof ( struct Hunk ) + maxAlign + size ) {
uintptr_t uproxy = ( me -> topFrame . bumper + alignof ( Unhunk ) - 1 ) & - uintptr_t ( alignof ( Unhunk ));
if ( uproxy + sizeof ( struct Unhunk ) <= me -> topFrame . end )
goto unhunked ;
}
// At this point we must need another hunk, either to fit the object
// itself or its Unhunk proxy.
mallocSize = nextSize ;
INFO ( NCCL_ALLOC , "%s:%d memory stack hunk malloc(%llu)" , __FILE__ , __LINE__ , ( unsigned long long ) mallocSize );
struct Hunk * top1 = ( struct Hunk * ) malloc ( mallocSize );
if ( top1 == nullptr ) goto malloc_exhausted ;
top1 -> size = nextSize ;
top1 -> above = nullptr ;
if ( top ) top -> above = top1 ;
top = top1 ;
me -> topFrame . hunk = top ;
me -> topFrame . end = reinterpret_cast < uintptr_t > ( top ) + nextSize ;
me -> topFrame . bumper = reinterpret_cast < uintptr_t > ( top ) + sizeof ( struct Hunk );
}
{ // Try to fit object in the new top hunk.
uintptr_t uobj = ( me -> topFrame . bumper + align - 1 ) & - uintptr_t ( align );
if ( uobj + size <= me -> topFrame . end ) {
me -> topFrame . bumper = uobj + size ;
return reinterpret_cast < void *> ( uobj );
}
}
unhunked :
{ // We need to allocate the object out-of-band and put an Unhunk proxy in-band
// to keep track of it.
uintptr_t uproxy = ( me -> topFrame . bumper + alignof ( Unhunk ) - 1 ) & - uintptr_t ( alignof ( Unhunk ));
Unhunk * proxy = reinterpret_cast < Unhunk *> ( uproxy );
me -> topFrame . bumper = uproxy + sizeof ( Unhunk );
proxy -> next = me -> topFrame . unhunks ;
me -> topFrame . unhunks = proxy ;
mallocSize = size ;
proxy -> obj = malloc ( mallocSize );
INFO ( NCCL_ALLOC , "%s:%d memory stack non-hunk malloc(%llu)" , __FILE__ , __LINE__ , ( unsigned long long ) mallocSize );
if ( proxy -> obj == nullptr ) goto malloc_exhausted ;
return proxy -> obj ;
}
malloc_exhausted :
WARN ( "%s:%d Unrecoverable error detected: malloc(size=%llu) returned null." , __FILE__ , __LINE__ , ( unsigned long long ) mallocSize );
abort ();
}
void ncclMemoryStackDestruct ( struct ncclMemoryStack * me ) {
// Free unhunks first because both the frames and unhunk proxies lie within the hunks.
struct ncclMemoryStack :: Frame * f = & me -> topFrame ;
while ( f != nullptr ) {
struct ncclMemoryStack :: Unhunk * u = f -> unhunks ;
while ( u != nullptr ) {
free ( u -> obj );
u = u -> next ;
}
f = f -> below ;
}
// Free hunks
struct ncclMemoryStack :: Hunk * h = me -> stub . above ;
while ( h != nullptr ) {
struct ncclMemoryStack :: Hunk * h1 = h -> above ;
free ( h );
h = h1 ;
}
}
2020-02-03 22:06:44 +00:00
int ncclDebugLevel = - 1 ;
void ncclDebugInit () {
if ( ncclDebugLevel != - 1 ) return ;
const char * nccl_debug = getenv ( "NCCL_DEBUG" );
if ( nccl_debug == NULL ) {
2021-04-30 16:57:36 -07:00
ncclDebugLevel = NCCL_LOG_INFO ;
2020-02-03 22:06:44 +00:00
} else if ( strcasecmp ( nccl_debug , "VERSION" ) == 0 ) {
ncclDebugLevel = NCCL_LOG_VERSION ;
} else if ( strcasecmp ( nccl_debug , "WARN" ) == 0 ) {
ncclDebugLevel = NCCL_LOG_WARN ;
} else if ( strcasecmp ( nccl_debug , "INFO" ) == 0 ) {
ncclDebugLevel = NCCL_LOG_INFO ;
} else if ( strcasecmp ( nccl_debug , "ABORT" ) == 0 ) {
ncclDebugLevel = NCCL_LOG_ABORT ;
} else if ( strcasecmp ( nccl_debug , "TRACE" ) == 0 ) {
ncclDebugLevel = NCCL_LOG_TRACE ;
}
}
void ncclDebugLog ( ncclDebugLogLevel level , unsigned long flags , const char * filefunc , int line , const char * fmt , ...) {
if ( ncclDebugLevel == - 1 ) ncclDebugInit ();
if ( level == NCCL_LOG_TRACE && ncclDebugLevel != NCCL_LOG_TRACE ) return ;
2022-03-31 17:09:21 -07:00
if ( ncclDebugLevel < level || (( flags & ( NCCL_INIT | NCCL_GRAPH | NCCL_TUNING )) == 0 )) return ;
2021-04-30 16:57:36 -07:00
2020-02-03 22:06:44 +00:00
char buffer [ 1024 ];
2020-03-03 11:42:40 -08:00
size_t len = 0 ;
if ( node_model ) len = snprintf ( buffer , sizeof ( buffer ),
"[%d:%d] " , node_model -> nodeId , node_model -> currRank );
2020-02-03 22:06:44 +00:00
va_list args ;
va_start ( args , fmt );
vsprintf ( buffer + len , fmt , args );
va_end ( args );
printf ( "%s \n " , buffer );
2022-04-18 11:14:51 -07:00
#if 0
2020-02-03 22:06:44 +00:00
if (level == NCCL_LOG_WARN) {
fprintf(stderr,"[%d:%d] %s:%d TOPO EXPL ABORT\n",
node_model->nodeId, node_model->currRank, filefunc, line);
abort();
}
2022-04-18 11:14:51 -07:00
#endif
2020-02-03 22:06:44 +00:00
}
2020-03-03 11:42:40 -08:00
ncclResult_t ncclTopoGetSystem ( const char * xmlTopoFile , struct ncclTopoSystem ** system ) {
struct ncclXml * xml ;
NCCLCHECK ( ncclCalloc ( & xml , 1 ));
2021-07-28 13:27:06 -07:00
NCCLCHECK ( ncclTopoGetXmlFromFile ( xmlTopoFile , xml , 0 ));
2020-03-03 11:42:40 -08:00
NCCLCHECK ( ncclTopoGetSystemFromXml ( xml , system ));
free ( xml );
return ncclSuccess ;
}
2023-06-21 20:54:24 -07:00
NCCL_PARAM ( CollNetEnable , "COLLNET_ENABLE" , 0 );
2020-03-03 11:42:40 -08:00
2021-07-28 13:27:06 -07:00
void initCollNet () {
if ( ncclParamCollNetEnable () == 1 && ncclCollNet == 0 )
ncclCollNet = ( ncclCollNet_t * ) 0x12345678 ;
}
2023-06-21 20:54:24 -07:00
ncclResult_t initChannel ( struct ncclComm * comm , int channelId ) {
struct ncclChannel * channel = & comm -> channels [ channelId ];
2021-07-28 13:27:06 -07:00
if ( channel -> id != - 1 ) return ncclSuccess ;
2023-06-21 20:54:24 -07:00
int nRanks = comm -> nRanks ;
int nPeers = nRanks + 1 /* Collnet */ + comm -> localRanks /* NVLS */ ;
channel -> id = channelId ;
channel -> workFifoSent = 0 ;
struct ncclSharedResources * sharedRes = comm -> sharedRes ;
//NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
if ( channel -> peers == NULL ) {
// The extra on nRanks+1 is for collnet root (i.e. network)
// Allocate everything related to sharedRes with ncclCalloc as this can be
// shared between communicators hence should not be tied to comm.
if ( sharedRes -> peers [ channelId ] == NULL ) {
NCCLCHECK ( ncclCalloc ( sharedRes -> peers + channelId , sharedRes -> tpNRanks ));
}
channel -> peers = ncclMemoryStackAlloc < struct ncclChannelPeer *> ( & comm -> memPermanent , nPeers );
for ( int r = 0 ; r < nRanks ; r ++ ) {
channel -> peers [ r ] = comm -> sharedRes -> peers [ channelId ] + comm -> topParentRanks [ r ];
ncclAtomicRefCountIncrement ( & channel -> peers [ r ] -> refCount );
}
}
#if 0
if (channel->devPeers == NULL) {
if (sharedRes->devPeers[channelId] == NULL) {
NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream));
}
/* channel->devPeers is not shared, so just free it when calling commFree() */
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devPeers);
for (int r = 0; r < nRanks; r++) {
uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
2021-07-28 13:27:06 -07:00
}
}
2023-06-21 20:54:24 -07:00
#endif
channel -> ring . userRanks = ncclMemoryStackAlloc < int > ( & comm -> memPermanent , nRanks );
//NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream));
//ncclCommPushCudaFree(comm, channel->devRingUserRanks);
2021-07-28 13:27:06 -07:00
2023-06-21 20:54:24 -07:00
//NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
//CUDACHECK(hipEventRecord(sharedRes->deviceStream.scratchEvent, sharedRes->deviceStream.cudaStream));
//CUDACHECK(hipStreamWaitEvent(sharedRes->deviceStream.cudaStream, sharedRes->deviceStream.scratchEvent, 0));
2021-07-28 13:27:06 -07:00
return ncclSuccess ;
}
2022-04-18 11:14:51 -07:00
ncclResult_t fillInfo ( struct ncclComm * comm , struct ncclPeerInfo * info , uint64_t commHash ) {
info -> rank = comm -> rank ;
info -> cudaDev = node_model -> rankToCudaDev ( comm -> rank );
info -> hostHash = node_model -> hostHash ;
info -> pidHash = node_model -> pidHash ;
// Get the device MAJOR:MINOR of /dev/shm so we can use that
// information to decide whether we can use SHM for inter-process
// communication in a container environment
//struct stat statbuf;
//SYSCHECK(stat("/dev/shm", &statbuf), "stat");
info -> shmDev = 0x19 ;
info -> busId = node_model -> getGpuBusId ( comm -> rank );
// detect if fine grained memory is available on this GPU
info -> hasFineGrain = true ;
info -> gdrSupport = 1 ;
info -> comm = comm ;
info -> cudaCompCap = 1 ;
return ncclSuccess ;
}
2021-07-28 13:27:06 -07:00
static ncclResult_t setupChannel ( struct ncclComm * comm , int channelId , int rank , int nranks , int * ringRanks ) {
TRACE ( NCCL_INIT , "rank %d nranks %d" , rank , nranks );
NCCLCHECK ( initChannel ( comm , channelId ));
struct ncclRing * ring = & comm -> channels [ channelId ]. ring ;
// Find our ring-distance from rank zero and reorganize ranks to start with rank.
int ixZero = 0 , ixRank = 0 ;
for ( int i = 0 ; i < nranks ; i ++ ) {
if ( ringRanks [ i ] == 0 ) ixZero = i ;
if ( ringRanks [ i ] == rank ) ixRank = i ;
}
ring -> index = ( ixRank - ixZero + nranks ) % nranks ;
for ( int i = 0 ; i < nranks ; i ++ ) {
ring -> userRanks [ i ] = ringRanks [( i + ixRank ) % nranks ];
}
return ncclSuccess ;
}
template < int type >
2022-01-13 13:33:07 -08:00
static ncclResult_t selectTransport ( struct ncclComm * comm , struct ncclTopoGraph * graph , struct ncclConnect * connect , int channelId , int peer , int connIndex , int * transportType ) {
2021-07-28 13:27:06 -07:00
struct ncclPeerInfo * myInfo = comm -> peerInfo + comm -> rank ;
struct ncclPeerInfo * peerInfo = comm -> peerInfo + peer ;
2023-06-21 20:54:24 -07:00
struct ncclConnector * connector = ( type == 1 ) ? comm -> channels [ channelId ]. peers [ peer ] -> send + connIndex :
comm -> channels [ channelId ]. peers [ peer ] -> recv + connIndex ;
2021-07-28 13:27:06 -07:00
// handle intra-node network connections
int n1 = - 1 , n2 = - 1 ;
if ( connIndex == NCCL_CONN_IDX_P2P_NET ) {
NCCLCHECK ( ncclTopoGetIntraNetDev ( comm -> topo , comm -> rank , graph , channelId , ( type == 1 ) ? 1 : 0 , & n1 ));
NCCLCHECK ( ncclTopoGetIntraNetDev ( comm -> topo , peer , graph , channelId , ( type == 1 ) ? 0 : 1 , & n2 ));
}
2022-01-13 13:33:07 -08:00
bool xgmi ;
NCCLCHECK ( ncclTopoGetLinkType ( comm -> topo , myInfo -> cudaDev , peerInfo -> cudaDev , & xgmi ));
2022-09-09 01:20:52 +00:00
2021-07-28 13:27:06 -07:00
for ( int t = 0 ; t < NTRANSPORTS ; t ++ ) {
if ( graph == NULL && connIndex == NCCL_CONN_IDX_P2P_NET && ( t == TRANSPORT_SHM || ( ! xgmi && t == TRANSPORT_P2P ))) continue ;
if ( graph && n1 >= 0 && n2 >= 0 && t != TRANSPORT_NET ) continue ;
2022-09-09 01:20:52 +00:00
struct ncclTransport * transport = ncclTransports [ t ];
2021-07-28 13:27:06 -07:00
struct ncclTransportComm * transportComm = type == 1 ? & transport -> send : & transport -> recv ;
int ret = 0 ;
NCCLCHECK ( transport -> canConnect ( & ret , comm -> topo , graph , myInfo , peerInfo ));
if ( ret ) {
connector -> transportComm = transportComm ;
NCCLCHECK ( transportComm -> setup ( comm , graph , myInfo , peerInfo , connect , connector , channelId , connIndex ));
2022-01-13 13:33:07 -08:00
if ( transportType ) * transportType = t ;
2021-07-28 13:27:06 -07:00
return ncclSuccess ;
}
}
2022-04-18 11:14:51 -07:00
WARN ( "No transport found for rank %d[%lx] -> rank %d[%lx]" , myInfo -> rank , myInfo -> busId , peerInfo -> rank , peerInfo -> busId );
return ncclSystemError ;
2021-07-28 13:27:06 -07:00
}
2022-09-09 01:20:52 +00:00
ncclResult_t ncclTransportP2pConnect ( struct ncclComm * comm , int channelId , int nrecv , int * peerRecv , int nsend , int * peerSend , int connIndex ) {
2021-07-28 13:27:06 -07:00
TRACE ( NCCL_INIT , "nsend %d nrecv %d" , nsend , nrecv );
2022-09-09 01:20:52 +00:00
struct ncclChannel * channel = & comm -> channels [ channelId ];
2023-02-04 01:43:38 +00:00
uint64_t mask = 1UL << channel -> id ;
2021-07-28 13:27:06 -07:00
for ( int i = 0 ; i < nrecv ; i ++ ) {
int peer = peerRecv [ i ];
2023-06-21 20:54:24 -07:00
if ( peer == - 1 || peer >= comm -> nRanks || peer == comm -> rank || channel -> peers [ peer ] -> recv [ connIndex ]. connected ) continue ;
2022-09-09 01:20:52 +00:00
comm -> connectRecv [ peer + comm -> nRanks * ( connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0 )] |= mask ;
2021-07-28 13:27:06 -07:00
}
for ( int i = 0 ; i < nsend ; i ++ ) {
int peer = peerSend [ i ];
2023-06-21 20:54:24 -07:00
if ( peer == - 1 || peer >= comm -> nRanks || peer == comm -> rank || channel -> peers [ peer ] -> send [ connIndex ]. connected ) continue ;
2022-09-09 01:20:52 +00:00
comm -> connectSend [ peer + comm -> nRanks * ( connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0 )] |= mask ;
2021-07-28 13:27:06 -07:00
}
return ncclSuccess ;
}
2022-04-18 11:14:51 -07:00
void dumpData ( struct ncclConnect * data , int ndata ) {
for ( int n = 0 ; n < ndata ; n ++ ) {
printf ( "[%d] " , n );
uint8_t * d = ( uint8_t * ) data ;
for ( int i = 0 ; i < sizeof ( struct ncclConnect ); i ++ ) printf ( "%02x" , d [ i ]);
printf ( " \n " );
}
}
2022-01-13 13:33:07 -08:00
ncclResult_t ncclTransportP2pSetup ( struct ncclComm * comm , struct ncclTopoGraph * graph , int connIndex , int * highestTransportType /*=NULL*/ ) {
2021-07-28 13:27:06 -07:00
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
2023-02-04 01:43:38 +00:00
ncclResult_t ret = ncclSuccess ;
2022-09-09 01:20:52 +00:00
int highestType = TRANSPORT_P2P ; // track highest transport type
2023-06-21 20:54:24 -07:00
struct ncclConnect ** data = ( ncclConnect ** ) malloc ( sizeof ( ncclConnect * ) * comm -> nRanks ); // Store intermediate send/recvData structs for connect
struct ncclConnect ** recvData = ( ncclConnect ** ) malloc ( sizeof ( ncclConnect * ) * comm -> nRanks ); // Points to entries inside data for given recv connection within a channel
struct ncclConnect ** sendData = ( ncclConnect ** ) malloc ( sizeof ( ncclConnect * ) * comm -> nRanks ); // Points to entries inside data for given send connection within a channel
2023-02-04 01:43:38 +00:00
2023-06-21 20:54:24 -07:00
//NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
// First time initialization
2021-07-28 13:27:06 -07:00
for ( int i = 1 ; i < comm -> nRanks ; i ++ ) {
int bootstrapTag = ( i << 8 ) + ( graph ? graph -> id + 1 : 0 );
int recvPeer = ( comm -> rank - i + comm -> nRanks ) % comm -> nRanks ;
int sendPeer = ( comm -> rank + i ) % comm -> nRanks ;
2023-02-04 01:43:38 +00:00
uint64_t recvMask = comm -> connectRecv [ recvPeer + comm -> nRanks * ( connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0 )];
uint64_t sendMask = comm -> connectSend [ sendPeer + comm -> nRanks * ( connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0 )];
2021-07-28 13:27:06 -07:00
2023-06-21 20:54:24 -07:00
// Data[i] contains all ncclConnect information for all send and receive connections with a given send and recv peer
// This data is packed in the array based on the number of sendChannels and recvChannels connected with these peers
// The first N entries contain recvData, connection information for recv connections
// The next M entries contain sendData, connection information for send connections
// It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
data [ i ] = ( ncclConnect * ) malloc ( sizeof ( ncclConnect ) * 2 * MAXCHANNELS );
recvData [ i ] = data [ i ];
2021-07-28 13:27:06 -07:00
int sendChannels = 0 , recvChannels = 0 ;
2022-01-13 13:33:07 -08:00
int type ;
2023-02-04 01:43:38 +00:00
TIME_START ( 0 );
2021-07-28 13:27:06 -07:00
for ( int c = 0 ; c < MAXCHANNELS ; c ++ ) {
2023-02-04 01:43:38 +00:00
if ( recvMask & ( 1UL << c )) {
2023-06-21 20:54:24 -07:00
NCCLCHECKGOTO ( selectTransport < 0 > ( comm , graph , recvData [ i ] + recvChannels ++ , c , recvPeer , connIndex , & type ), ret , fail );
2022-01-13 13:33:07 -08:00
if ( type > highestType ) highestType = type ;
2021-07-28 13:27:06 -07:00
}
}
2023-02-04 01:43:38 +00:00
TIME_STOP ( 0 );
TIME_START ( 1 );
2023-06-21 20:54:24 -07:00
sendData [ i ] = recvData [ i ] + recvChannels ;
2021-07-28 13:27:06 -07:00
for ( int c = 0 ; c < MAXCHANNELS ; c ++ ) {
2023-02-04 01:43:38 +00:00
if ( sendMask & ( 1UL << c )) {
2023-06-21 20:54:24 -07:00
NCCLCHECKGOTO ( selectTransport < 1 > ( comm , graph , sendData [ i ] + sendChannels ++ , c , sendPeer , connIndex , & type ), ret , fail );
2022-01-13 13:33:07 -08:00
if ( type > highestType ) highestType = type ;
2021-07-28 13:27:06 -07:00
}
}
2023-02-04 01:43:38 +00:00
TIME_STOP ( 1 );
2021-07-28 13:27:06 -07:00
2023-02-04 01:43:38 +00:00
TIME_START ( 2 );
2021-07-28 13:27:06 -07:00
if ( sendPeer == recvPeer ) {
if ( recvChannels + sendChannels ) {
2023-06-21 20:54:24 -07:00
//NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
//NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
sendData [ i ] = data [ i ];
recvData [ i ] = data [ i ] + sendChannels ;
2021-07-28 13:27:06 -07:00
}
} else {
2023-06-21 20:54:24 -07:00
//if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
//if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
//if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
//if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
2021-07-28 13:27:06 -07:00
}
2023-02-04 01:43:38 +00:00
TIME_STOP ( 2 );
2023-06-21 20:54:24 -07:00
}
2021-07-28 13:27:06 -07:00
2023-06-21 20:54:24 -07:00
// Loop until all channels with all ranks have been connected
bool allChannelsConnected ;
allChannelsConnected = false ;
while ( ! allChannelsConnected ) {
allChannelsConnected = true ;
for ( int i = 1 ; i < comm -> nRanks ; i ++ ) {
int recvPeer = ( comm -> rank - i + comm -> nRanks ) % comm -> nRanks ;
int sendPeer = ( comm -> rank + i ) % comm -> nRanks ;
uint64_t recvMask = comm -> connectRecv [ recvPeer + comm -> nRanks * ( connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0 )];
uint64_t sendMask = comm -> connectSend [ sendPeer + comm -> nRanks * ( connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0 )];
int sendDataOffset = 0 ;
int recvDataOffset = 0 ;
for ( int c = 0 ; c < MAXCHANNELS ; c ++ ) {
TIME_START ( 3 );
if ( sendMask & ( 1UL << c )) {
struct ncclConnector * conn = comm -> channels [ c ]. peers [ sendPeer ] -> send + connIndex ;
// This connector hasn't completed connection yet
if ( conn -> connected == 0 ) {
//NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
if ( ret == ncclSuccess ) {
struct ncclDevChannelPeer * addr ;
conn -> connected = 1 ;
/* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
//CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[sendPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
//CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
} else if ( ret == ncclInProgress ) {
allChannelsConnected = false ;
}
}
}
TIME_STOP ( 3 );
// Start with recv channels
TIME_START ( 4 );
if ( recvMask & ( 1UL << c )) {
struct ncclConnector * conn = comm -> channels [ c ]. peers [ recvPeer ] -> recv + connIndex ;
// This connector hasn't completed connection yet
if ( conn -> connected == 0 ) {
//NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
if ( ret == ncclSuccess ) {
struct ncclDevChannelPeer * addr ;
conn -> connected = 1 ;
/* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
//CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[recvPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
//CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
} else if ( ret == ncclInProgress ) {
allChannelsConnected = false ;
}
}
}
TIME_STOP ( 4 );
2021-07-28 13:27:06 -07:00
}
}
2023-06-21 20:54:24 -07:00
}
// Clear all connect masks and free each connectInfo array
for ( int i = 1 ; i < comm -> nRanks ; i ++ ) {
int recvPeer = ( comm -> rank - i + comm -> nRanks ) % comm -> nRanks ;
int sendPeer = ( comm -> rank + i ) % comm -> nRanks ;
2023-02-04 01:43:38 +00:00
comm -> connectRecv [ recvPeer + comm -> nRanks * ( connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0 )] = comm -> connectSend [ sendPeer + comm -> nRanks * ( connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0 )] = 0UL ;
2023-06-21 20:54:24 -07:00
free ( data [ i ]);
2021-07-28 13:27:06 -07:00
}
2023-02-04 01:43:38 +00:00
2023-06-21 20:54:24 -07:00
free ( data );
free ( sendData );
free ( recvData );
2022-01-13 13:33:07 -08:00
if ( highestTransportType != NULL ) * highestTransportType = highestType ;
2023-02-04 01:43:38 +00:00
TIME_PRINT ( "P2P Setup/Connect" );
exit :
2023-06-21 20:54:24 -07:00
//NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream));
//NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream));
2023-02-04 01:43:38 +00:00
return ret ;
fail :
goto exit ;
2021-07-28 13:27:06 -07:00
}
2021-03-19 12:58:13 -07:00
extern struct ncclTransport collNetTransport ;
// All ranks must participate in collNetSetup call
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
2021-04-30 16:57:36 -07:00
int ncclTransportCollNetSetup ( struct ncclComm * comm , struct ncclTopoGraph * collNetGraph , struct ncclChannel * channel , int masterRank , int masterPeer , int collNetGraphChannelId , int type ) {
2021-07-28 13:27:06 -07:00
int fail = 1 ;
2021-04-30 16:57:36 -07:00
int rank = comm -> rank ;
int nranks = comm -> nRanks ;
int nMasters = comm -> nNodes ;
2021-03-19 12:58:13 -07:00
int rankInCollNet = - 1 ;
int isMaster = ( rank == masterRank ) ? 1 : 0 ;
struct {
int collNetRank ;
ncclConnect connect ;
} sendrecvExchange ;
// check if we can connect to collnet, whose root is the nranks-th rank
struct ncclPeerInfo * myInfo = comm -> peerInfo + rank , * peerInfo = comm -> peerInfo + nranks ;
peerInfo -> rank = nranks ;
// send master receives connect info from peer recv master
2021-04-30 16:57:36 -07:00
if ( isMaster && type == collNetSend ) {
//NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)));
2021-03-19 12:58:13 -07:00
rankInCollNet = sendrecvExchange . collNetRank ;
2021-04-30 16:57:36 -07:00
TRACE ( NCCL_INIT , "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d" , rank , rankInCollNet , nMasters , masterPeer );
2021-03-19 12:58:13 -07:00
}
// select
2023-06-21 20:54:24 -07:00
struct ncclChannelPeer * root = channel -> peers [ nranks ];
2021-04-30 16:57:36 -07:00
// connector index: 0 for recv, 1 for send
struct ncclConnector * conn = ( type == collNetRecv ) ? root -> recv + type : root -> send + type ;
struct ncclTransportComm * transportComm = ( type == collNetRecv ) ? & ( collNetTransport . recv ) : & ( collNetTransport . send );
2021-03-19 12:58:13 -07:00
conn -> transportComm = transportComm ;
// setup
struct ncclConnect myConnect ;
2022-09-09 01:20:52 +00:00
if ( isMaster ) {
2021-04-30 16:57:36 -07:00
NCCLCHECK ( transportComm -> setup ( comm , collNetGraph , myInfo , peerInfo , & myConnect , conn , collNetGraphChannelId , type ));
2021-03-19 12:58:13 -07:00
}
// prepare connect handles
ncclResult_t res ;
struct {
int isMaster ;
ncclConnect connect ;
} * allConnects = NULL ;
ncclConnect * masterConnects = NULL ;
2021-04-30 16:57:36 -07:00
NCCLCHECK ( ncclCalloc ( & masterConnects , nMasters ));
if ( type == collNetRecv ) { // recv side: AllGather
2021-03-19 12:58:13 -07:00
// all ranks must participate
2021-04-30 16:57:36 -07:00
NCCLCHECK ( ncclCalloc ( & allConnects , nranks ));
allConnects [ rank ]. isMaster = isMaster ;
memcpy ( & ( allConnects [ rank ]. connect ), & myConnect , sizeof ( struct ncclConnect ));
2021-03-19 12:58:13 -07:00
//NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
// consolidate
2021-04-30 16:57:36 -07:00
int c = 0 ;
for ( int r = 0 ; r < nranks ; r ++ ) {
if ( allConnects [ r ]. isMaster ) {
memcpy ( masterConnects + c , & ( allConnects [ r ]. connect ), sizeof ( struct ncclConnect ));
if ( r == rank ) rankInCollNet = c ;
c ++ ;
}
}
2021-03-19 12:58:13 -07:00
} else { // send side : copy in connect info received from peer recv master
//if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
}
// connect
2022-09-09 01:20:52 +00:00
if ( isMaster ) {
2023-06-21 20:54:24 -07:00
NCCLCHECKGOTO ( transportComm -> connect ( comm , masterConnects , nMasters , rankInCollNet , conn ), res , cleanup );
struct ncclDevChannelPeer * devRoot ;
//CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
struct ncclConnInfo * devConnInfo = ( type == collNetRecv ) ? devRoot -> recv + type : devRoot -> send + type ;
//CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
2021-03-19 12:58:13 -07:00
}
// recv side sends connect info to send side
2021-04-30 16:57:36 -07:00
if ( isMaster && type == collNetRecv ) {
sendrecvExchange . collNetRank = rankInCollNet ;
2021-03-19 12:58:13 -07:00
//memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
2021-04-30 16:57:36 -07:00
//NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
TRACE ( NCCL_INIT , "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d" , rank , rankInCollNet , nMasters , masterPeer );
2021-03-19 12:58:13 -07:00
}
2022-09-09 01:20:52 +00:00
fail = 0 ;
2021-03-19 12:58:13 -07:00
cleanup :
2021-04-30 16:57:36 -07:00
if ( allConnects != NULL ) free ( allConnects );
if ( masterConnects != NULL ) free ( masterConnects );
2021-07-28 13:27:06 -07:00
return fail ;
2021-03-25 20:59:32 -07:00
}
2021-04-30 16:57:36 -07:00
ncclResult_t ncclTransportCollNetCheck ( struct ncclComm * comm , int collNetSetupFail ) {
// AllGather collNet setup results
2022-04-18 11:14:51 -07:00
int allGatherFailures [ NCCL_MAX_LOCAL_RANKS ] = { 0 };
allGatherFailures [ comm -> localRank ] = collNetSetupFail ;
//NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int)));
2022-01-13 13:33:07 -08:00
for ( int i = 0 ; i < comm -> localRanks ; i ++ ) {
2021-04-30 16:57:36 -07:00
if ( allGatherFailures [ i ] != 0 ) {
collNetSetupFail = 1 ;
break ;
}
}
if ( collNetSetupFail ) {
2022-04-18 11:14:51 -07:00
if ( comm -> localRank == 0 ) WARN ( "Cannot initialize CollNet, using point-to-point network instead" );
2021-05-26 09:24:34 -07:00
return ncclSystemError ;
}
return ncclSuccess ;
}
ncclResult_t ncclTransportCollNetFree ( struct ncclComm * comm ) {
// Free collNet resources
for ( int r = 0 ; r < comm -> nChannels ; r ++ ) {
struct ncclChannel * channel = comm -> channels + r ;
2023-06-21 20:54:24 -07:00
struct ncclChannelPeer * peer = channel -> peers [ comm -> nRanks ];
if ( peer ) {
if ( ncclAtomicRefCountDecrement ( & peer -> refCount ) == 0 ) {
for ( int b = 0 ; b < NCCL_MAX_CONNS ; b ++ ) {
struct ncclConnector * send = peer -> send + b ;
if ( send -> transportResources && send -> transportComm ) NCCLCHECK ( send -> transportComm -> free ( send ));
send -> transportResources = NULL ; // avoid double free
}
for ( int b = 0 ; b < NCCL_MAX_CONNS ; b ++ ) {
struct ncclConnector * recv = peer -> recv + b ;
if ( recv -> transportResources && recv -> transportComm ) NCCLCHECK ( recv -> transportComm -> free ( recv ));
recv -> transportResources = NULL ; // avoid double free
}
}
2021-04-30 16:57:36 -07:00
}
}
return ncclSuccess ;
}
2023-06-21 20:54:24 -07:00
ncclResult_t initTransportsRank_1 ( struct ncclComm * comm , struct allGatherInfo * allGather3Data ,
struct ncclTopoGraph & treeGraph , struct ncclTopoGraph & ringGraph , struct ncclTopoGraph & collNetGraph , struct ncclTopoGraph & nvlsGraph , struct ncclComm * parent ) {
2022-04-18 11:14:51 -07:00
// We use 2 AllGathers
// 1. { peerInfo, comm, compCap}
// 2. { nChannels, graphInfo, topoRanks }
2023-02-04 01:43:38 +00:00
ncclResult_t ret = ncclSuccess ;
2020-02-03 22:06:44 +00:00
int rank = comm -> rank ;
int nranks = comm -> nRanks ;
2023-02-04 01:43:38 +00:00
cpu_set_t affinitySave ;
2023-06-21 20:54:24 -07:00
//struct ncclTopoGraph ringGraph;
//struct ncclTopoGraph treeGraph;
//struct ncclTopoGraph collNetGraph;
//struct ncclTopoGraph nvlsGraph;
struct ncclTopoGraph * graphs [] = { & treeGraph , & ringGraph , & collNetGraph , & collNetGraph , & nvlsGraph , & nvlsGraph };
int nChannelsOrig ;
struct ncclTopoRanks ** allTopoRanks = NULL ;
int * nodesFirstRank = NULL , * nodesTreePatterns = NULL ;
int * rings = NULL ;
int * nvbPeers = NULL ;
struct ncclProxyConnector proxyConn ;
int * pxnPeers = NULL ;
int * topParentLocalRanks = NULL ;
int tpProxyRank ;
2020-02-03 22:06:44 +00:00
// AllGather1 - begin
2023-02-04 01:43:38 +00:00
//NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
//NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, commHash), ret, fail);
//NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
2020-12-01 11:33:47 -05:00
2023-06-21 20:54:24 -07:00
for ( int i = 0 ; i < nranks ; i ++ ) {
if (( i != rank ) && ( comm -> peerInfo [ i ]. hostHash == comm -> peerInfo [ rank ]. hostHash ) && ( comm -> peerInfo [ i ]. busId == comm -> peerInfo [ rank ]. busId )) {
WARN ( "Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx" , rank , i , comm -> peerInfo [ rank ]. busId );
ret = ncclInvalidUsage ;
goto fail ;
2020-12-01 11:33:47 -05:00
}
}
2020-02-03 22:06:44 +00:00
// AllGather1 - end
2023-02-04 01:43:38 +00:00
do {
// Compute intra-process ranks
int intraProcRank0 = - 1 , intraProcRank = - 1 , intraProcRanks = 0 ;
2023-06-21 20:54:24 -07:00
for ( int i = 0 ; i < nranks ; i ++ ) comm -> minCompCap = std :: min ( comm -> minCompCap , comm -> peerInfo [ rank ]. cudaCompCap );
for ( int i = 0 ; i < nranks ; i ++ ) comm -> maxCompCap = std :: max ( comm -> maxCompCap , comm -> peerInfo [ rank ]. cudaCompCap );
2023-02-04 01:43:38 +00:00
for ( int i = 0 ; i < nranks ; i ++ ) {
if (( comm -> peerInfo [ i ]. hostHash == comm -> peerInfo [ rank ]. hostHash )
&& ( comm -> peerInfo [ i ]. pidHash == comm -> peerInfo [ rank ]. pidHash )) {
// Rank is in same process
if ( intraProcRanks == 0 ) intraProcRank0 = i ;
if ( i == rank ) intraProcRank = intraProcRanks ;
intraProcRanks ++ ;
if ( intraProcRank0 == rank && rank != i ) {
comm -> peerInfo [ i ]. comm -> intraNext = comm -> intraNext ;
comm -> intraNext = comm -> peerInfo [ i ]. comm ;
}
}
}
TRACE ( NCCL_INIT , "pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d" ,
rank , comm -> peerInfo [ rank ]. pidHash , intraProcRank , intraProcRanks , intraProcRank0 );
if ( intraProcRank == - 1 || intraProcRank0 == - 1 || comm -> peerInfo [ intraProcRank0 ]. comm == NULL ) {
WARN ( "Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d" ,
rank , comm -> peerInfo [ rank ]. hostHash , comm -> peerInfo [ rank ]. pidHash ,
intraProcRank , intraProcRanks , intraProcRank0 );
ret = ncclInternalError ;
goto fail ;
}
struct ncclComm * comm0 = comm -> peerInfo [ intraProcRank0 ]. comm ;
assert ( intraProcRank == 0 ? comm == comm0 : true );
comm -> intraComm0 = comm0 ;
comm -> intraRank = intraProcRank ;
comm -> intraRanks = intraProcRanks ;
comm -> intraBarrierPhase = 0 ;
comm -> intraBarrierCounter = 0 ;
comm -> intraBarrierGate = 0 ;
} while ( 0 );
2020-02-03 22:06:44 +00:00
// Topo detection / System graph creation
2023-02-04 01:43:38 +00:00
//NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
2021-04-30 16:57:36 -07:00
// save nRanks to ncclTopoSystem as indicator of multi-node
2021-03-05 19:59:41 -08:00
comm -> topo -> nRanks = comm -> nRanks ;
2021-11-05 08:53:47 -07:00
// init netGdrLevel
comm -> topo -> netGdrLevel = - 2 ;
2022-09-09 01:20:52 +00:00
// init Pivot A2A related fields
comm -> topo -> pivotA2AEnabled = false ;
comm -> topo -> pivotA2ANumBiRings = 0 ;
2022-10-20 15:40:03 +00:00
// LL128
comm -> topo -> ll128Enabled = false ;
2023-03-15 05:34:25 +08:00
// Topology hint for MSCCL internal scheduler about whether to enable MSCCL
comm -> topo -> mscclEnabled = false ;
2020-02-03 22:06:44 +00:00
// Compute paths between GPUs and NICs
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTopoComputePaths ( comm -> topo , comm ), ret , fail );
2020-02-03 22:06:44 +00:00
// Remove inaccessible GPUs and unused NICs
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTopoTrimSystem ( comm -> topo , comm ), ret , fail );
2020-02-03 22:06:44 +00:00
// Recompute paths after trimming
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTopoComputePaths ( comm -> topo , comm ), ret , fail );
2020-03-03 11:42:40 -08:00
// Init search
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTopoSearchInit ( comm -> topo ), ret , fail );
2020-02-03 22:06:44 +00:00
// Print final topology
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTopoPrint ( comm -> topo ), ret , fail );
2020-02-03 22:06:44 +00:00
2022-04-18 11:14:51 -07:00
// Set Affinity to a CPU local the our GPU, so that all memory we allocate
// on the host is local.
2023-02-04 01:43:38 +00:00
//NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail);
//if (CPU_COUNT(&comm->cpuAffinity)) {
// sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
// sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
2022-04-18 11:14:51 -07:00
//}
2023-06-21 20:54:24 -07:00
// Determine local CollNet support
if ( collNetSupport ( comm )) {
char * collNetEnable = getenv ( "NCCL_COLLNET_ENABLE" );
if ( collNetEnable != NULL ) {
INFO ( NCCL_ALL , "NCCL_COLLNET_ENABLE set by environment to %s." , collNetEnable );
if ( strcmp ( collNetEnable , "1" ) == 0 ) {
comm -> collNetSupport = 1 ;
}
}
}
// Determine local Nvls support
//NCCLCHECK(ncclNvlsInit(comm));
2022-04-18 11:14:51 -07:00
2020-02-03 22:06:44 +00:00
// Get rings and trees
2020-03-03 11:42:40 -08:00
ringGraph . id = 0 ;
2020-02-03 22:06:44 +00:00
ringGraph . pattern = NCCL_TOPO_PATTERN_RING ;
2020-03-03 11:42:40 -08:00
ringGraph . collNet = 0 ;
ringGraph . minChannels = 1 ;
ringGraph . maxChannels = MAXCHANNELS / 2 ;
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTopoCompute ( comm -> topo , & ringGraph ), ret , fail );
NCCLCHECKGOTO ( ncclTopoPrintGraph ( comm -> topo , & ringGraph ), ret , fail );
2020-02-03 22:06:44 +00:00
2020-03-03 11:42:40 -08:00
treeGraph . id = 1 ;
2021-07-28 13:27:06 -07:00
treeGraph . pattern = NCCL_TOPO_PATTERN_BALANCED_TREE ;
2020-03-03 11:42:40 -08:00
treeGraph . collNet = 0 ;
treeGraph . minChannels = comm -> topo -> nodes [ NET ]. count != 0 ? 1 : ringGraph . nChannels ;
treeGraph . maxChannels = ringGraph . nChannels ;
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTopoCompute ( comm -> topo , & treeGraph ), ret , fail );
NCCLCHECKGOTO ( ncclTopoPrintGraph ( comm -> topo , & treeGraph ), ret , fail );
2020-03-03 11:42:40 -08:00
collNetGraph . id = 2 ;
collNetGraph . pattern = NCCL_TOPO_PATTERN_TREE ;
collNetGraph . collNet = 1 ;
2022-04-18 11:14:51 -07:00
collNetGraph . minChannels = collNetGraph . maxChannels = ringGraph . nChannels ;
2023-06-21 20:54:24 -07:00
if ( comm -> collNetSupport ) {
NCCLCHECKGOTO ( ncclTopoCompute ( comm -> topo , & collNetGraph ), ret , fail );
NCCLCHECKGOTO ( ncclTopoPrintGraph ( comm -> topo , & collNetGraph ), ret , fail );
} else {
collNetGraph . nChannels = 0 ;
}
nvlsGraph . id = 3 ;
nvlsGraph . pattern = NCCL_TOPO_PATTERN_NVLS ;
nvlsGraph . collNet = 0 ;
nvlsGraph . minChannels = 1 ;
nvlsGraph . maxChannels = MAXCHANNELS ;
if ( comm -> nvlsSupport ) {
NCCLCHECKGOTO ( ncclTopoCompute ( comm -> topo , & nvlsGraph ), ret , fail );
NCCLCHECKGOTO ( ncclTopoPrintGraph ( comm -> topo , & nvlsGraph ), ret , fail );
} else {
nvlsGraph . nChannels = 0 ;
}
2020-03-03 11:42:40 -08:00
2023-02-04 01:43:38 +00:00
bool allXgmi , hasPeerAccess ;
allXgmi = true ;
hasPeerAccess = true ;
2022-04-18 11:14:51 -07:00
// Check that all the GPUs have peer access to one another and are XGMI connected
for ( int i = 0 ; i < nranks && hasPeerAccess ; i ++ ) {
int cudaDev1 = comm -> peerInfo [ i ]. cudaDev ;
for ( int j = 0 ; j < nranks ; j ++ ) {
if ( i == j ) continue ;
int cudaDev2 = comm -> peerInfo [ j ]. cudaDev ;
int p2p ;
if ( hipDeviceCanAccessPeer ( & p2p , cudaDev1 , cudaDev2 ) != hipSuccess || ! p2p )
{
hasPeerAccess = false ;
break ;
}
bool isXGMI ;
// Limit to single intermediate GPU for enabling clique
NCCLCHECK ( ncclTopoGetLinkType ( comm -> topo , i , j , & isXGMI , 1 ));
allXgmi &= isXGMI ;
}
}
2022-10-20 15:40:03 +00:00
// Initialize num P2P LL buffers for this communicator
comm -> allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers () == 1 ;
2022-04-18 11:14:51 -07:00
2020-03-03 11:42:40 -08:00
if ( comm -> rank == ncclParamGraphDumpFileRank ()) {
2023-06-21 20:54:24 -07:00
struct ncclTopoGraph * dumpGraphs [ 4 ] = { & ringGraph , & treeGraph , & collNetGraph , & nvlsGraph };
NCCLCHECKGOTO ( ncclTopoDumpGraphs ( comm -> topo , 4 , dumpGraphs ), ret , fail );
2020-03-03 11:42:40 -08:00
}
2022-09-09 01:20:52 +00:00
if (( comm -> topo -> type & RCCL_TOPO_4P2H_ROME ) && ( comm -> topo -> type & RCCL_TOPO_GDR_ALL )) {
if ( rcclParamP2pNetDisable () == 0 ) {
if ( ! ( comm -> topo -> type & RCCL_TOPO_FORCE_INTRA )) comm -> p2pNet = 1 ;
INFO ( NCCL_INIT , "RCCL enabled same node P2P over network" );
}
else
INFO ( NCCL_INIT , "RCCL force disabled same node P2P over network" );
}
2020-02-03 22:06:44 +00:00
// AllGather3 - begin
2023-02-04 01:43:38 +00:00
//NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
2020-08-26 11:40:11 -07:00
int idx ;
2022-04-18 11:14:51 -07:00
NCCLCHECK ( ncclTopoIdToIndex ( comm -> topo , GPU , comm -> busId , & idx ));
2021-04-30 16:57:36 -07:00
allGather3Data [ rank ]. nc = 2 ;
2023-06-21 20:54:24 -07:00
if ( comm -> topo -> nodes [ GPU ]. count == comm -> topo -> nRanks &&
2022-10-20 15:40:03 +00:00
comm -> topo -> nodes [ GPU ]. nodes [ idx ]. gpu . gcn == 906 && allXgmi )
2021-07-27 17:32:41 -07:00
allGather3Data [ rank ]. nc = 4 ;
2021-07-27 08:30:08 -07:00
if ( comm -> topo -> nodes [ GPU ]. nodes [ idx ]. gpu . gcn == 908 )
allGather3Data [ rank ]. nc = std :: max ( 4 / ringGraph . nChannels , 2 );
2023-06-21 20:54:24 -07:00
if ( comm -> topo -> nodes [ GPU ]. count == comm -> topo -> nRanks &&
2022-10-20 15:40:03 +00:00
( comm -> topo -> type & RCCL_TOPO_CR8G ))
2021-04-30 16:57:36 -07:00
allGather3Data [ rank ]. nc = 4 ;
2023-06-21 20:54:24 -07:00
if ( comm -> topo -> nodes [ GPU ]. count == comm -> topo -> nRanks &&
2022-10-20 15:40:03 +00:00
comm -> topo -> nodes [ GPU ]. nodes [ idx ]. gpu . gcn == 910 )
2021-08-24 09:42:04 -07:00
allGather3Data [ rank ]. nc = 4 ;
2021-10-12 08:23:20 -07:00
if ( comm -> topo -> nodes [ GPU ]. nodes [ idx ]. gpu . gcn == 910 )
allGather3Data [ rank ]. nc = std :: max ( allGather3Data [ rank ]. nc , 4 / ringGraph . nChannels );
2022-01-14 10:03:30 -08:00
if ( ringGraph . nChannels > MAXCHANNELS / 2 )
allGather3Data [ rank ]. nc = 1 ;
2022-09-09 01:20:52 +00:00
allGather3Data [ rank ]. pivotA2AEnabled = comm -> topo -> pivotA2AEnabled && rcclParamPivotAlltoallEnable ();
2022-10-20 15:40:03 +00:00
comm -> topo -> ll128Enabled = comm -> topo -> ll128Enabled || rcclParamLL128ForceEnable ();
allGather3Data [ rank ]. ll128Enabled = comm -> topo -> ll128Enabled ;
2023-03-15 05:34:25 +08:00
allGather3Data [ rank ]. mscclEnabled = comm -> topo -> mscclEnabled ;
2020-02-03 22:06:44 +00:00
2023-06-21 20:54:24 -07:00
for ( int a = 0 ; a < NCCL_NUM_ALGORITHMS ; a ++ ) {
allGather3Data [ rank ]. graphInfo [ a ]. pattern = graphs [ a ] -> pattern ;
allGather3Data [ rank ]. graphInfo [ a ]. nChannels = graphs [ a ] -> nChannels ;
allGather3Data [ rank ]. graphInfo [ a ]. sameChannels = graphs [ a ] -> sameChannels ;
allGather3Data [ rank ]. graphInfo [ a ]. bwIntra = graphs [ a ] -> bwIntra ;
allGather3Data [ rank ]. graphInfo [ a ]. bwInter = graphs [ a ] -> bwInter ;
allGather3Data [ rank ]. graphInfo [ a ]. typeIntra = graphs [ a ] -> typeIntra ;
allGather3Data [ rank ]. graphInfo [ a ]. typeInter = graphs [ a ] -> typeInter ;
}
comm -> nChannels = std :: min ( treeGraph . nChannels , ringGraph . nChannels );
NCCLCHECKGOTO ( ncclTopoPreset ( comm , graphs , & allGather3Data [ rank ]. topoRanks ), ret , fail );
2023-02-04 01:43:38 +00:00
fail :
return ret ;
2020-02-03 22:06:44 +00:00
}
2023-06-21 20:54:24 -07:00
ncclResult_t initTransportsRank_3 ( struct ncclComm * comm , struct allGatherInfo * allGather3Data ,
struct ncclTopoGraph & treeGraph , struct ncclTopoGraph & ringGraph , struct ncclTopoGraph & collNetGraph , struct ncclTopoGraph & nvlsGraph ) {
ncclResult_t ret = ncclSuccess ;
2020-02-03 22:06:44 +00:00
int rank = comm -> rank ;
int nranks = comm -> nRanks ;
2023-06-21 20:54:24 -07:00
cpu_set_t affinitySave ;
struct ncclTopoGraph * graphs [] = { & treeGraph , & ringGraph , & collNetGraph , & collNetGraph , & nvlsGraph , & nvlsGraph };
2023-02-04 01:43:38 +00:00
int nChannelsOrig ;
struct ncclTopoRanks ** allTopoRanks = NULL ;
int * nodesFirstRank = NULL , * nodesTreePatterns = NULL ;
int * rings = NULL ;
int * nvbPeers = NULL ;
struct ncclProxyConnector proxyConn ;
int * pxnPeers = NULL ;
2023-06-21 20:54:24 -07:00
int * topParentLocalRanks = NULL ;
int tpProxyRank ;
2023-02-04 01:43:38 +00:00
//NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
2020-02-03 22:06:44 +00:00
// Determine nNodes, firstRanks, ...
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclCalloc ( & nodesFirstRank , nranks ), ret , fail );
NCCLCHECKGOTO ( ncclCalloc ( & nodesTreePatterns , nranks ), ret , fail );
NCCLCHECKGOTO ( ncclCalloc ( & comm -> rankToNode , comm -> nRanks ), ret , fail );
2022-04-18 11:14:51 -07:00
for ( int r = 0 ; r < nranks ; r ++ ) {
int node ;
int firstRank = allGather3Data [ r ]. topoRanks . ringRecv [ 0 ];
for ( node = 0 ; node < comm -> nNodes && nodesFirstRank [ node ] != firstRank ; node ++ );
if ( node == comm -> nNodes ) {
comm -> nNodes ++ ;
2020-02-03 22:06:44 +00:00
nodesFirstRank [ node ] = firstRank ;
2020-12-01 11:33:47 -05:00
// Record tree pattern of each node as they can be different depending on sm arch
2023-06-21 20:54:24 -07:00
nodesTreePatterns [ node ] = allGather3Data [ r ]. graphInfo [ NCCL_ALGO_TREE ]. pattern ;
2020-02-03 22:06:44 +00:00
}
2022-04-18 11:14:51 -07:00
comm -> rankToNode [ r ] = node ;
}
// Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclCalloc ( & comm -> nodeRanks , comm -> nNodes ), ret , fail );
NCCLCHECKGOTO ( ncclCalloc ( & comm -> rankToLocalRank , comm -> nRanks ), ret , fail );
2022-04-18 11:14:51 -07:00
for ( int r = 0 ; r < comm -> nRanks ; r ++ ) {
int node = comm -> rankToNode [ r ];
comm -> rankToLocalRank [ r ] = comm -> nodeRanks [ node ]. localRanks ;
comm -> nodeRanks [ node ]. localRanks ++ ;
}
// Allocate ranks arrays for each node
for ( int n = 0 ; n < comm -> nNodes ; n ++ ) {
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclCalloc ( & comm -> nodeRanks [ n ]. localRankToRank , comm -> nodeRanks [ n ]. localRanks ), ret , fail );
2022-04-18 11:14:51 -07:00
comm -> maxLocalRanks = std :: max ( comm -> maxLocalRanks , comm -> nodeRanks [ n ]. localRanks );
comm -> nodeRanks [ n ]. localRanks = 0 ;
}
// And fill the ranks arrays
for ( int r = 0 ; r < comm -> nRanks ; r ++ ) {
int node = comm -> rankToNode [ r ];
comm -> nodeRanks [ node ]. localRankToRank [ comm -> nodeRanks [ node ]. localRanks ++ ] = r ;
}
comm -> node = comm -> rankToNode [ rank ];
comm -> localRankToRank = comm -> nodeRanks [ comm -> node ]. localRankToRank ;
comm -> localRank = comm -> rankToLocalRank [ rank ];
comm -> localRanks = comm -> nodeRanks [ comm -> node ]. localRanks ;
TRACE ( NCCL_INIT , "hostHash[%d] %lx localRank %d localRanks %d localRank0 %d" ,
rank , comm -> peerInfo [ rank ]. hostHash , comm -> localRank , comm -> localRanks , comm -> localRankToRank [ 0 ]);
if ( comm -> localRank == - 1 || comm -> localRankToRank [ 0 ] == - 1 || comm -> localRanks == 0 ) {
WARN ( "Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d" ,
rank , comm -> peerInfo [ rank ]. hostHash , comm -> peerInfo [ rank ]. pidHash ,
comm -> localRank , comm -> localRanks , comm -> localRankToRank [ 0 ]);
2023-02-04 01:43:38 +00:00
ret = ncclInternalError ;
goto fail ;
2020-02-03 22:06:44 +00:00
}
2023-02-04 01:43:38 +00:00
nChannelsOrig = comm -> nChannels ;
NCCLCHECKGOTO ( ncclCalloc ( & allTopoRanks , comm -> nRanks ), ret , fail );
int nc ;
nc = allGather3Data [ 0 ]. nc ;
2020-02-03 22:06:44 +00:00
for ( int i = 0 ; i < nranks ; i ++ ) {
allTopoRanks [ i ] = & allGather3Data [ i ]. topoRanks ;
2021-04-05 17:51:56 -07:00
nc = std :: min ( allGather3Data [ i ]. nc , nc );
2020-02-03 22:06:44 +00:00
// Make sure we align all ranks so that the tuning is consistent across ranks
2022-09-09 01:20:52 +00:00
comm -> topo -> pivotA2AEnabled = comm -> topo -> pivotA2AEnabled && allGather3Data [ i ]. pivotA2AEnabled ;
2022-10-20 15:40:03 +00:00
comm -> topo -> ll128Enabled = comm -> topo -> ll128Enabled && allGather3Data [ i ]. ll128Enabled ;
2023-03-15 05:34:25 +08:00
comm -> topo -> mscclEnabled = comm -> topo -> mscclEnabled && allGather3Data [ i ]. mscclEnabled ;
2023-06-21 20:54:24 -07:00
for ( int a = 0 ; a < NCCL_NUM_ALGORITHMS ; a ++ ) {
graphs [ a ] -> nChannels = std :: min ( allGather3Data [ i ]. graphInfo [ a ]. nChannels , graphs [ a ] -> nChannels );
graphs [ a ] -> sameChannels = std :: min ( allGather3Data [ i ]. graphInfo [ a ]. sameChannels , graphs [ a ] -> sameChannels );
graphs [ a ] -> bwIntra = std :: min ( allGather3Data [ i ]. graphInfo [ a ]. bwIntra , graphs [ a ] -> bwIntra );
graphs [ a ] -> bwInter = std :: min ( allGather3Data [ i ]. graphInfo [ a ]. bwInter , graphs [ a ] -> bwInter );
graphs [ a ] -> typeIntra = std :: max ( allGather3Data [ i ]. graphInfo [ a ]. typeIntra , graphs [ a ] -> typeIntra );
graphs [ a ] -> typeInter = std :: max ( allGather3Data [ i ]. graphInfo [ a ]. typeInter , graphs [ a ] -> typeInter );
}
if ( graphs [ NCCL_ALGO_COLLNET_CHAIN ] -> nChannels == 0 ) comm -> collNetSupport = 0 ;
if ( graphs [ NCCL_ALGO_NVLS ] -> nChannels == 0 ) comm -> nvlsSupport = 0 ;
2020-02-03 22:06:44 +00:00
}
2020-12-01 11:33:47 -05:00
2021-05-26 09:24:34 -07:00
comm -> nChannels = treeGraph . nChannels = ringGraph . nChannels =
( comm -> topo -> nodes [ GPU ]. count != comm -> topo -> nRanks && comm -> topo -> nodes [ NET ]. count )
? std :: min ( treeGraph . nChannels , ringGraph . nChannels ) : ringGraph . nChannels ;
2020-02-03 22:06:44 +00:00
if ( comm -> nChannels < nChannelsOrig ) {
// We started duplicating channels during Preset(), so we need to move the
// duplicated channels since we have removed some.
for ( int i = 0 ; i < comm -> nChannels ; i ++ ) memcpy ( comm -> channels + comm -> nChannels + i , comm -> channels + nChannelsOrig + i , sizeof ( struct ncclChannel ));
}
2022-04-18 11:14:51 -07:00
// Determine CollNet support after all-gather now that we know nNodes and each node localRanks
if ( comm -> collNetSupport == 1 ) {
int collNetNodeThreshold = ncclParamCollNetNodeThreshold ();
if ( comm -> nNodes < collNetNodeThreshold ) {
2021-07-28 13:27:06 -07:00
INFO ( NCCL_INIT , "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet" , comm -> nNodes , collNetNodeThreshold );
2022-04-18 11:14:51 -07:00
comm -> collNetSupport = 0 ;
}
for ( int n = 0 ; n < comm -> nNodes ; n ++ ) {
if ( comm -> nodeRanks [ n ]. localRanks > NCCL_MAX_DIRECT_ARITY + 1 ) {
WARN ( "CollNet currently only supports up to %d GPUs per node, disabling CollNet" , NCCL_MAX_DIRECT_ARITY + 1 );
comm -> collNetSupport = 0 ;
break ;
}
}
2021-07-28 13:27:06 -07:00
}
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclCalloc ( & rings , nranks * MAXCHANNELS ), ret , fail );
2020-02-03 22:06:44 +00:00
2023-06-21 20:54:24 -07:00
NCCLCHECKGOTO ( ncclTopoPostset ( comm , nodesFirstRank , nodesTreePatterns , allTopoRanks , rings , graphs , nc ), ret , fail );
2023-06-06 08:41:38 -07:00
if ( comm -> topo -> pivotA2ANumBiRings == 3 ) NCCLCHECK ( ncclTreeBasePostset ( comm , & treeGraph ));
2020-02-03 22:06:44 +00:00
// AllGather3 - end
TRACE ( NCCL_INIT , "rank %d nranks %d - BUILT %d TREES/RINGS" , rank , nranks , comm -> nChannels );
2023-06-06 08:41:38 -07:00
char line [ 1024 ];
2020-02-03 22:06:44 +00:00
line [ 0 ] = '\0' ;
for ( int c = 0 ; c < comm -> nChannels ; c ++ ) {
2020-12-01 11:33:47 -05:00
struct ncclTree * tree = & comm -> channels [ c ]. tree ;
snprintf ( line + strlen ( line ), 1023 - strlen ( line ), " [%d] %d/%d/%d->%d->%d" ,
c , tree -> down [ 0 ], tree -> down [ 1 ], tree -> down [ 2 ], rank , tree -> up );
2023-06-06 08:41:38 -07:00
INFO ( NCCL_GRAPH , "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx" , c , comm -> channels [ c ]. ring . prev ,
comm -> rank , comm -> channels [ c ]. ring . next , comm , comm -> nRanks , comm -> busId );
2020-02-03 22:06:44 +00:00
}
line [ 1023 ] = '\0' ;
2023-06-06 08:41:38 -07:00
INFO ( NCCL_INIT , "Trees%s comm %p nRanks %02d busId %lx" , line , comm , comm -> nRanks , comm -> busId );
2020-02-03 22:06:44 +00:00
2023-02-04 01:43:38 +00:00
//NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
2020-04-02 18:01:21 -07:00
2023-06-21 20:54:24 -07:00
// Compute nChannels per peer for p2p
NCCLCHECKGOTO ( ncclTopoComputeP2pChannels ( comm ), ret , fail );
/* until now, all info of comm should be known. We can initialize shared resources and
* map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
* all proxy operations. */
if ( comm -> sharedRes -> owner == comm ) {
comm -> sharedRes -> tpNLocalRanks = comm -> localRanks ;
comm -> sharedRes -> magic = comm -> magic ;
comm -> sharedRes -> tpNChannels = comm -> nChannels ;
comm -> sharedRes -> tpP2pNChannels = comm -> p2pnChannels ;
memcpy ( comm -> sharedRes -> tpRankToLocalRank , comm -> rankToLocalRank , sizeof ( int ) * comm -> nRanks );
}
NCCLCHECKGOTO ( ncclCalloc ( & topParentLocalRanks , comm -> localRanks ), ret , fail );
for ( int i = 0 ; i < comm -> localRanks ; ++ i ) {
int tpRank = comm -> topParentRanks [ comm -> localRankToRank [ i ]];
topParentLocalRanks [ i ] = comm -> sharedRes -> tpRankToLocalRank [ tpRank ];
}
comm -> topParentLocalRanks = topParentLocalRanks ;
// Launch proxy service thread, after this, the proxy calls can be used.
//NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
2020-02-03 22:06:44 +00:00
// Connect with prev/next for each ring
for ( int c = 0 ; c < comm -> nChannels ; c ++ ) {
struct ncclChannel * channel = comm -> channels + c ;
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( setupChannel ( comm , c , rank , nranks , rings + c * nranks ), ret , fail );
2020-02-03 22:06:44 +00:00
if ( comm -> nRanks == 1 ) continue ;
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTransportP2pConnect ( comm , c , 1 , & channel -> ring . prev , 1 , & channel -> ring . next , 0 ), ret , fail );
2020-12-01 11:33:47 -05:00
}
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTransportP2pSetup ( comm , & ringGraph , 0 ), ret , fail );
2022-09-09 01:20:52 +00:00
if ( ringGraph . nIntraChannels && rcclParamP2pNetDisable () == 0 ) {
2021-06-09 13:24:26 -07:00
comm -> useIntraNet = 1 ;
// Connect NET for intranode use
for ( int c = 0 ; c < comm -> nChannels ; c ++ ) {
struct ncclChannel * channel = comm -> channels + c ;
if ( comm -> nRanks == 1 ) continue ;
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTransportP2pConnect ( comm , c , 1 , & channel -> ring . prev , 1 , & channel -> ring . next , NCCL_CONN_IDX_P2P_NET ), ret , fail );
2021-06-09 13:24:26 -07:00
}
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTransportP2pSetup ( comm , & ringGraph , NCCL_CONN_IDX_P2P_NET ), ret , fail );
2021-06-09 13:24:26 -07:00
}
2020-12-01 11:33:47 -05:00
INFO ( NCCL_INIT , "Connected all rings" );
// Connect Trees
for ( int c = 0 ; c < comm -> nChannels ; c ++ ) {
struct ncclChannel * channel = comm -> channels + c ;
if ( comm -> nRanks == 1 ) continue ;
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTransportP2pConnect ( comm , c , NCCL_MAX_TREE_ARITY , channel -> tree . down , 1 , & channel -> tree . up , 0 ), ret , fail );
NCCLCHECKGOTO ( ncclTransportP2pConnect ( comm , c , 1 , & channel -> tree . up , NCCL_MAX_TREE_ARITY , channel -> tree . down , 0 ), ret , fail );
2020-03-03 11:42:40 -08:00
}
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTransportP2pSetup ( comm , & treeGraph , 0 ), ret , fail );
2020-12-01 11:33:47 -05:00
INFO ( NCCL_INIT , "Connected all trees" );
2020-03-03 11:42:40 -08:00
2023-06-21 20:54:24 -07:00
#if 0
// Setup NVLS
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
// And NVLS trees if needed
if (comm->nvlsSupport && comm->localRanks > 1) {
for (int c=0; c<comm->nvlsChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &nvlsGraph, 0), ret, fail);
INFO(NCCL_INIT, "Connected NVLS tree");
}
#endif
#if CUDART_VERSION >= 12010
2020-03-03 11:42:40 -08:00
// Check if we can setup CollNet
2023-06-21 20:54:24 -07:00
if ( comm -> collNetSupport > 0 ) collNetTrySetup ( comm , parent , & collNetGraph );
#endif
2021-05-26 09:24:34 -07:00
2020-02-03 22:06:44 +00:00
TRACE ( NCCL_INIT , "rank %d nranks %d - CONNECTED %d RINGS AND TREES" , rank , nranks , comm -> nChannels );
2020-03-03 11:42:40 -08:00
2020-12-01 11:33:47 -05:00
// Compute time models for algorithm and protocol combinations
2023-06-21 20:54:24 -07:00
NCCLCHECKGOTO ( ncclTopoTuneModel ( comm , comm -> minCompCap , comm -> maxCompCap , graphs ), ret , fail );
2023-06-06 08:41:38 -07:00
INFO ( NCCL_INIT , "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer" , comm -> nChannels , comm -> nvlsChannels , comm -> p2pnChannels , comm -> p2pnChannelsPerPeer );
2022-04-18 11:14:51 -07:00
#if 0
2022-09-09 01:20:52 +00:00
do { // Setup p2p structures in comm->tasks
struct ncclTasks* tasks = &comm->tasks;
int node = comm->node;
int nNodes = comm->nNodes;
struct ncclNodeRanks *nodeRanks = comm->nodeRanks;
int localRank = comm->localRank;
2023-06-21 20:54:24 -07:00
// We want to fuse along node boundaries. Make sure nsteps is a multiple or divides 8.
int steps = ALIGN_POWER(comm->maxLocalRanks, NCCL_MAX_WORK_ELEMENTS_P2P/2);
tasks->p2pOrderSteps = comm->nNodes * steps;
tasks->peers = ncclMemoryStackAlloc<ncclTasks::Peer>(&comm->memPermanent, tasks->p2pOrderSteps);
tasks->p2pSendOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, tasks->p2pOrderSteps);
tasks->p2pRecvOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, tasks->p2pOrderSteps);
int i=0;
2022-09-09 01:20:52 +00:00
// schedule delta 0, +1, -1, +2, -2, ...
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
for (int d=0; d <= nNodes/4; d++) {
int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
int index = 0;
int delta = deltas[index];
sched_delta:
int recvNode = (node+nNodes-delta)%nNodes;
int sendNode = (node+delta)%nNodes;
for (int step=0; step < steps; step++) {
int recvIndex = (localRank-step+steps)%steps;
2023-06-21 20:54:24 -07:00
int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
tasks->p2pRecvOrder[i] = recvRank;
2022-09-09 01:20:52 +00:00
int sendIndex = (localRank+step)%steps;
2023-06-21 20:54:24 -07:00
int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
tasks->p2pSendOrder[i] = sendRank;
i++;
2022-09-09 01:20:52 +00:00
}
index++;
if (index == 1 && deltas[1] == deltas[0]) index++;
if (index == 2 && deltas[2] == deltas[0]) index++;
if (index == 3 && deltas[3] == deltas[2]) index++;
if (index == 3 && deltas[3] == deltas[1]) index++;
if (index < 4) {
delta = deltas[index];
goto sched_delta;
}
}
2023-06-21 20:54:24 -07:00
assert(i == tasks->p2pOrderSteps);
2022-09-09 01:20:52 +00:00
} while (0);
2022-04-18 11:14:51 -07:00
if (ncclParamNvbPreconnect()) {
// Connect p2p when using NVB path
int nvbNpeers;
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
2022-04-18 11:14:51 -07:00
for (int r=0; r<nvbNpeers; r++) {
int peer = nvbPeers[r];
2022-09-09 01:20:52 +00:00
int channelId;
2022-04-18 11:14:51 -07:00
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail);
2023-06-21 20:54:24 -07:00
if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {
2023-02-04 01:43:38 +00:00
comm->connectSend[peer] |= (1UL<<channelId);
2022-04-18 11:14:51 -07:00
}
}
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail);
2023-06-21 20:54:24 -07:00
if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {
2023-02-04 01:43:38 +00:00
comm->connectRecv[peer] |= (1UL<<channelId);
2022-04-18 11:14:51 -07:00
}
}
}
2023-06-21 20:54:24 -07:00
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
2022-04-18 11:14:51 -07:00
}
#endif
// Connect to local net proxy
2023-06-21 20:54:24 -07:00
tpProxyRank = comm -> topParentRanks [ comm -> rank ];
//NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
//NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
2022-04-18 11:14:51 -07:00
// Then to remote ones when using PXN
2022-09-09 01:20:52 +00:00
if ( ncclPxnDisable ( comm ) == 0 ) {
2022-04-18 11:14:51 -07:00
int nranks ;
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO ( ncclTopoGetPxnRanks ( comm , & pxnPeers , & nranks ), ret , fail );
2022-04-18 11:14:51 -07:00
for ( int r = 0 ; r < nranks ; r ++ ) {
2023-06-21 20:54:24 -07:00
tpProxyRank = comm -> topParentRanks [ pxnPeers [ r ]];
//NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
//NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
2022-04-18 11:14:51 -07:00
}
}
2022-09-09 01:20:52 +00:00
#if 0
if (comm->intraRank == 0) { // Load ncclParamLaunchMode
char* str = getenv("NCCL_LAUNCH_MODE");
enum ncclLaunchMode mode, modeOld;
if (str && strcasecmp(str, "GROUP") == 0) {
mode = ncclLaunchModeGroup;
} else {
mode = ncclLaunchModeParallel;
}
// In theory we could be racing with other communicators not associated with
// this one if the user is connecting to multiple ncclUniqueId's concurrently.
modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
}
}
2023-02-04 01:43:38 +00:00
// Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
// launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
2022-10-20 15:40:03 +00:00
2022-04-18 11:14:51 -07:00
/* Local intra-node barrier */
2023-02-04 01:43:38 +00:00
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
2022-09-09 01:20:52 +00:00
#endif
2020-04-02 18:01:21 -07:00
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
2020-12-01 11:33:47 -05:00
TRACE ( NCCL_INIT , "rank %d nranks %d - DONE" , rank , nranks );
2023-02-04 01:43:38 +00:00
exit :
//if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
2023-06-21 20:54:24 -07:00
/* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
* attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
* properly cleaned up. */
//if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess) ncclProxyShmUnlink(comm);
2023-02-04 01:43:38 +00:00
free ( allTopoRanks );
free ( nodesTreePatterns );
free ( nodesFirstRank );
//free(allGather3Data);
free ( rings );
free ( nvbPeers );
free ( pxnPeers );
return ret ;
fail :
goto exit ;
2020-02-03 22:06:44 +00:00
}
2021-07-27 08:30:08 -07:00
ncclResult_t rocm_smi_init () {
return ncclSuccess ;
}
2022-10-19 14:19:50 -07:00
ncclResult_t rocm_smi_getNumDevice ( uint32_t * num_devs ) {
return ncclSuccess ;
}
ncclResult_t rocm_smi_getDevicePciBusIdString ( uint32_t deviceIndex , char * busId , size_t len ) {
return ncclSuccess ;
}
2021-07-27 08:30:08 -07:00
ncclResult_t rocm_smi_getDeviceIndexByPciBusId ( const char * pciBusId , uint32_t * deviceIndex ) {
return ncclSuccess ;
}
2022-10-19 14:19:50 -07:00
ncclResult_t rocm_smi_getLinkInfo ( int srcIndex , int dstIndex , RSMI_IO_LINK_TYPE * rsmi_type , int * hops , int * count ) {
2021-07-27 08:30:08 -07:00
return ncclSuccess ;
}
2022-09-09 01:20:52 +00:00
int ncclNetVersion ( struct ncclComm * comm ) {
return 4 ;
}