Remove dependency to HSA_FORCE_FINE_GRAIN_PCIE flag for XGMI link

Šī revīzija ir iekļauta:
Wenkai Du
2019-07-31 23:48:44 +00:00
vecāks 1969e89003
revīzija e7022e9196
3 mainīti faili ar 37 papildinājumiem un 16 dzēšanām
+7 -2
Parādīt failu
@@ -361,8 +361,13 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
if (isFineGrain)
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
if (isFineGrain) {
hipError_t e = hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained);
if (e != hipSuccess) {
*ptr = 0;
return ncclInvalidUsage;
}
}
else
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
+22 -10
Parādīt failu
@@ -8,8 +8,6 @@
#include "ring.h"
#include "param.h"
extern bool useFineGrainVramPcie;
NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
ncclResult_t initRing(struct ncclComm* comm, int ringid) {
@@ -19,26 +17,40 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) {
// Setup intermediate buffering
ring->buffSize = ncclParamBuffsize();
// attempt to allocate buffers in fine grain
const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
struct ncclSendMem* sendMem;
NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize, useFineGrainVramPcie));
ncclCudaCalloc((char**)&sendMem, sendSize, true);
ring->devMemSend = sendMem;
const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
struct ncclRecvMem* recvMem;
NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize, useFineGrainVramPcie));
ncclCudaCalloc((char**)&recvMem, recvSize, true);
ring->devMemRecv = recvMem;
TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
// Pre-configure send/recv pointers. Those are the default, they may change later.
ring->recv.conn.buff = recvMem->buff;
ring->recv.conn.llBuff = recvMem->llBuff;
ring->recv.conn.tail = &recvMem->tail;
ring->recv.conn.opCount = &recvMem->opCount;
if (recvMem){
ring->recv.conn.buff = recvMem->buff;
ring->recv.conn.llBuff = recvMem->llBuff;
ring->recv.conn.tail = &recvMem->tail;
ring->recv.conn.opCount = &recvMem->opCount;
} else {
ring->recv.conn.buff = 0;
ring->recv.conn.llBuff = 0;
ring->recv.conn.tail = 0;
ring->recv.conn.opCount = 0;
}
ring->recv.conn.direct = 0;
ring->send.conn.head = &sendMem->head;
ring->send.conn.llHead = &sendMem->llHead;
if (sendMem) {
ring->send.conn.head = &sendMem->head;
ring->send.conn.llHead = &sendMem->llHead;
} else {
ring->send.conn.head = 0;
ring->send.conn.llHead = 0;
}
ring->send.conn.direct = 0;
ring->send.conn.llStep = 0;
ring->send.conn.llLastCleaning = 0;
+8 -4
Parādīt failu
@@ -101,8 +101,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
return ncclSuccess;
}
if (!useFineGrainVramPcie) p2p = 0;
if (p2p == 0) return ncclSuccess;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
@@ -119,8 +117,14 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
}
int nvlinkp2p = 0;
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1)
nvlinkp2p = CONNECT_NVLINK;
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) {
if (hops == 1)
nvlinkp2p = CONNECT_NVLINK;
} else {
if (!useFineGrainVramPcie)
return ncclSuccess;
}
#else
// Check for NVLink/NVswitch
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);