diff --git a/src/include/core.h b/src/include/core.h index f524cd5014..2e803facbc 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -361,8 +361,13 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { template static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) { - if (isFineGrain) - CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained)); + if (isFineGrain) { + hipError_t e = hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained); + if (e != hipSuccess) { + *ptr = 0; + return ncclInvalidUsage; + } + } else CUDACHECK(hipMalloc(ptr, nelem*sizeof(T))); CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T))); diff --git a/src/ring.cu b/src/ring.cu index 23e27571f0..293557fa39 100644 --- a/src/ring.cu +++ b/src/ring.cu @@ -8,8 +8,6 @@ #include "ring.h" #include "param.h" -extern bool useFineGrainVramPcie; - NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES); ncclResult_t initRing(struct ncclComm* comm, int ringid) { @@ -19,26 +17,40 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) { // Setup intermediate buffering ring->buffSize = ncclParamBuffsize(); + // attempt to allocate buffers in fine grain const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem); struct ncclSendMem* sendMem; - NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize, useFineGrainVramPcie)); + ncclCudaCalloc((char**)&sendMem, sendSize, true); ring->devMemSend = sendMem; const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize; struct ncclRecvMem* recvMem; - NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize, useFineGrainVramPcie)); + ncclCudaCalloc((char**)&recvMem, recvSize, true); ring->devMemRecv = recvMem; TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize); // Pre-configure send/recv pointers. Those are the default, they may change later. - ring->recv.conn.buff = recvMem->buff; - ring->recv.conn.llBuff = recvMem->llBuff; - ring->recv.conn.tail = &recvMem->tail; - ring->recv.conn.opCount = &recvMem->opCount; + if (recvMem){ + ring->recv.conn.buff = recvMem->buff; + ring->recv.conn.llBuff = recvMem->llBuff; + ring->recv.conn.tail = &recvMem->tail; + ring->recv.conn.opCount = &recvMem->opCount; + } else { + ring->recv.conn.buff = 0; + ring->recv.conn.llBuff = 0; + ring->recv.conn.tail = 0; + ring->recv.conn.opCount = 0; + } ring->recv.conn.direct = 0; - ring->send.conn.head = &sendMem->head; - ring->send.conn.llHead = &sendMem->llHead; + + if (sendMem) { + ring->send.conn.head = &sendMem->head; + ring->send.conn.llHead = &sendMem->llHead; + } else { + ring->send.conn.head = 0; + ring->send.conn.llHead = 0; + } ring->send.conn.direct = 0; ring->send.conn.llStep = 0; ring->send.conn.llLastCleaning = 0; diff --git a/src/transport/p2p.cu b/src/transport/p2p.cu index 6c706dc6a7..2e70f5b80c 100644 --- a/src/transport/p2p.cu +++ b/src/transport/p2p.cu @@ -101,8 +101,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin return ncclSuccess; } - if (!useFineGrainVramPcie) p2p = 0; - if (p2p == 0) return ncclSuccess; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) @@ -119,8 +117,14 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev)); } int nvlinkp2p = 0; - if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1) - nvlinkp2p = CONNECT_NVLINK; + if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) { + if (hops == 1) + nvlinkp2p = CONNECT_NVLINK; + } else { + if (!useFineGrainVramPcie) + return ncclSuccess; + } + #else // Check for NVLink/NVswitch int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);