Remove dependency to HSA_FORCE_FINE_GRAIN_PCIE flag for XGMI link
Šī revīzija ir iekļauta:
@@ -361,8 +361,13 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
if (isFineGrain)
|
||||
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
|
||||
if (isFineGrain) {
|
||||
hipError_t e = hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained);
|
||||
if (e != hipSuccess) {
|
||||
*ptr = 0;
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
}
|
||||
else
|
||||
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
|
||||
+22
-10
@@ -8,8 +8,6 @@
|
||||
#include "ring.h"
|
||||
#include "param.h"
|
||||
|
||||
extern bool useFineGrainVramPcie;
|
||||
|
||||
NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
|
||||
|
||||
ncclResult_t initRing(struct ncclComm* comm, int ringid) {
|
||||
@@ -19,26 +17,40 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) {
|
||||
// Setup intermediate buffering
|
||||
ring->buffSize = ncclParamBuffsize();
|
||||
|
||||
// attempt to allocate buffers in fine grain
|
||||
const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
|
||||
struct ncclSendMem* sendMem;
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize, useFineGrainVramPcie));
|
||||
ncclCudaCalloc((char**)&sendMem, sendSize, true);
|
||||
ring->devMemSend = sendMem;
|
||||
|
||||
const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
|
||||
struct ncclRecvMem* recvMem;
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize, useFineGrainVramPcie));
|
||||
ncclCudaCalloc((char**)&recvMem, recvSize, true);
|
||||
ring->devMemRecv = recvMem;
|
||||
|
||||
TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
|
||||
|
||||
// Pre-configure send/recv pointers. Those are the default, they may change later.
|
||||
ring->recv.conn.buff = recvMem->buff;
|
||||
ring->recv.conn.llBuff = recvMem->llBuff;
|
||||
ring->recv.conn.tail = &recvMem->tail;
|
||||
ring->recv.conn.opCount = &recvMem->opCount;
|
||||
if (recvMem){
|
||||
ring->recv.conn.buff = recvMem->buff;
|
||||
ring->recv.conn.llBuff = recvMem->llBuff;
|
||||
ring->recv.conn.tail = &recvMem->tail;
|
||||
ring->recv.conn.opCount = &recvMem->opCount;
|
||||
} else {
|
||||
ring->recv.conn.buff = 0;
|
||||
ring->recv.conn.llBuff = 0;
|
||||
ring->recv.conn.tail = 0;
|
||||
ring->recv.conn.opCount = 0;
|
||||
}
|
||||
ring->recv.conn.direct = 0;
|
||||
ring->send.conn.head = &sendMem->head;
|
||||
ring->send.conn.llHead = &sendMem->llHead;
|
||||
|
||||
if (sendMem) {
|
||||
ring->send.conn.head = &sendMem->head;
|
||||
ring->send.conn.llHead = &sendMem->llHead;
|
||||
} else {
|
||||
ring->send.conn.head = 0;
|
||||
ring->send.conn.llHead = 0;
|
||||
}
|
||||
ring->send.conn.direct = 0;
|
||||
ring->send.conn.llStep = 0;
|
||||
ring->send.conn.llLastCleaning = 0;
|
||||
|
||||
@@ -101,8 +101,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
if (!useFineGrainVramPcie) p2p = 0;
|
||||
|
||||
if (p2p == 0) return ncclSuccess;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
@@ -119,8 +117,14 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
|
||||
link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
|
||||
}
|
||||
int nvlinkp2p = 0;
|
||||
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1)
|
||||
nvlinkp2p = CONNECT_NVLINK;
|
||||
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) {
|
||||
if (hops == 1)
|
||||
nvlinkp2p = CONNECT_NVLINK;
|
||||
} else {
|
||||
if (!useFineGrainVramPcie)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#else
|
||||
// Check for NVLink/NVswitch
|
||||
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user