diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h index 4d6b6f4daf..8456c4bd69 100644 --- a/projects/rccl/src/include/alloc.h +++ b/projects/rccl/src/include/alloc.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "rccl_vars.h" #if CUDART_VERSION >= 11030 @@ -31,6 +32,78 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); } template<> constexpr size_t ncclSizeOfT() { return 1; } +struct ncclSideStream { + cudaStream_t stream; + uint64_t refCount; +}; + +inline std::unordered_map sideStream; +inline pthread_mutex_t sideStreamLock = PTHREAD_MUTEX_INITIALIZER; +extern ncclResult_t getBusId(int cudaDev, int64_t *busId); + +static inline ncclResult_t ncclCreateSideStream(int cudaDev) { + ncclResult_t res = ncclSuccess; + int64_t busId; + NCCLCHECK(getBusId(cudaDev, &busId)); + pthread_mutex_lock(&sideStreamLock); + if (auto it = sideStream.find(busId); it != sideStream.end()) { + it->second.refCount++; + INFO(NCCL_ALLOC, "Side stream %p of dev %d busid %lx inc count to %ld", + it->second.stream, cudaDev, busId, it->second.refCount); + } else { + cudaStream_t stream; + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), res, fail); + sideStream.emplace(busId, ncclSideStream{stream, 1}); + INFO(NCCL_ALLOC, "Created side stream %p of dev %d busid %lx", + stream, cudaDev, busId); + } +fail: + pthread_mutex_unlock(&sideStreamLock); + return res; +}; + +static inline ncclResult_t ncclDestroySideStream(int cudaDev) { + ncclResult_t res = ncclSuccess; + int64_t busId; + NCCLCHECK(getBusId(cudaDev, &busId)); + pthread_mutex_lock(&sideStreamLock); + if (auto it = sideStream.find(busId); it != sideStream.end()) { + it->second.refCount--; + if (it->second.refCount== 0) { + INFO(NCCL_ALLOC, "Destroyed side stream %p of dev %d busid %lx", + it->second.stream, cudaDev, busId); + CUDACHECKGOTO(cudaStreamDestroy(it->second.stream), res, fail); + sideStream.erase(it); + } else { + INFO(NCCL_ALLOC, "Side stream %p of dev %d busid %lx dec count to %ld", + it->second.stream, cudaDev, busId, it->second.refCount); + } + } else { + WARN("Side stream of dev %d busid %lx was not found for destroy", cudaDev, busId); + } +fail: + pthread_mutex_unlock(&sideStreamLock); + return res; +}; + +static inline ncclResult_t getSideStream(cudaStream_t *stream) { + int cudaDev; + int64_t busId; + CUDACHECK(cudaGetDevice(&cudaDev)); + NCCLCHECK(getBusId(cudaDev, &busId)); + pthread_mutex_lock(&sideStreamLock); + if (auto it = sideStream.find(busId); it != sideStream.end()) { + *stream = it->second.stream; + INFO(NCCL_ALLOC, "Found side stream %p of dev %d busid %lx count %ld", + it->second.stream, cudaDev, busId, it->second.refCount); + } else { + *stream = 0; + WARN("Side stream of dev %d busid %lx was not found", cudaDev, busId); + } + pthread_mutex_unlock(&sideStreamLock); + return ncclSuccess; +} + #if CUDART_VERSION >= 12020 static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) { @@ -362,7 +435,7 @@ finish: #define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__) template -ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, unsigned int flags = hipDeviceMallocDefault) { +ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, unsigned int flags = hipDeviceMallocDefault) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; @@ -370,13 +443,15 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); // Need a side stream so as not to interfere with graph capture. - cudaStream_t stream = sideStream; - if (stream == nullptr) + cudaStream_t stream, sidestream; + NCCLCHECK(getSideStream(&sidestream)); + stream = sidestream; + if (sidestream == nullptr) CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT(), flags), result, finish); CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT(), stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - if (sideStream == nullptr) + if (sidestream == nullptr) CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -428,11 +503,15 @@ ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); // Need a side stream so as not to interfere with graph capture. - cudaStream_t stream; - CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); + cudaStream_t stream, sidestream; + NCCLCHECK(getSideStream(&sidestream)); + stream = sidestream; + if (sidestream == nullptr) + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); + if (sidestream == nullptr) + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h index a1ac1234f5..65bb69b636 100644 --- a/projects/rccl/src/include/comm.h +++ b/projects/rccl/src/include/comm.h @@ -629,8 +629,6 @@ struct ncclComm { struct ncclKernelPlanner planner; - hipStream_t sideStream; // [RCCL] Cached non-captured stream - cudaMemPool_t memPool; // Queue of events and associated callbacks for cleaning up asynchronous work. // Using this is preferable to using CUDA host callbacks because host callbacks diff --git a/projects/rccl/src/include/gdrwrap.h b/projects/rccl/src/include/gdrwrap.h index 1954f00d1d..b3508a4b53 100644 --- a/projects/rccl/src/include/gdrwrap.h +++ b/projects/rccl/src/include/gdrwrap.h @@ -180,7 +180,7 @@ static gdr_t ncclGdrInit() { } template -static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) { +static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) { // gdr_info_t info; // unused variable - compiler warning size_t mapSize; // gdr_mh_t mh; // unused variable - compiler warning @@ -193,9 +193,9 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too #if defined(HIP_UNCACHED_MEMORY) - NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocUncached)); + NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, hipDeviceMallocUncached)); #else - NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocFinegrained)); + NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, hipDeviceMallocFinegrained)); #endif gdr_mem_desc_t* md; NCCLCHECK(ncclCalloc(&md, 1)); diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 5ef4c40481..f7aefd163b 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -94,7 +94,7 @@ NCCL_PARAM(CtaPolicy, "CTA_POLICY", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT); struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {}; -static ncclResult_t commReclaim(ncclComm_t comm); +ncclResult_t commReclaim(ncclComm_t comm); #ifdef ENABLE_MSCCLPP size_t std::hash::operator ()(const ncclUniqueId& uniqueId) const noexcept { @@ -521,7 +521,6 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(dtor->fn(dtor)); dtor = dtor->next; } - CUDACHECK(hipStreamDestroy(comm->sideStream)); ncclMemoryStackDestruct(&comm->memScoped); ncclMemoryStackDestruct(&comm->memPermanent); @@ -544,6 +543,7 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclNvlsSymmetricFinalize(comm)); NCCLCHECK(ncclIpcSymmetricFinalize(comm)); } + NCCLCHECK(ncclDestroySideStream(comm->cudaDev)); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy"); commPoison(comm); // poison comm before free to avoid comm reuse. @@ -650,6 +650,9 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in comm->lastStream = nullptr; CUDACHECK(cudaGetDevice(&comm->cudaDev)); + // RCCL: create persistent stream for calloc + NCCLCHECK(ncclCreateSideStream(comm->cudaDev)); + // Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support. // but can be enabled via environment variable if (rcclParamEnableContextTracking() == 1) { @@ -666,9 +669,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in comm->compCap = ncclCudaCompCap(); TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap); - // RCCL: create persistent stream for calloc - CUDACHECK(hipStreamCreateWithFlags(&comm->sideStream, hipStreamNonBlocking)); - comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false; @@ -819,7 +819,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { // The workFifoBuf lives in GDR mapped CUDA memory. - NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoBuf, &comm->workFifoBufDev, comm->workFifoBytes, &comm->workFifoBufGdrHandle, comm->sideStream), ret, fail); + NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoBuf, &comm->workFifoBufDev, comm->workFifoBytes, &comm->workFifoBufGdrHandle), ret, fail); ncclCommPushCudaGdrFree(comm, comm->workFifoBufGdrHandle); } else { // The workFifoBuf lives in cudaHost memory. @@ -876,7 +876,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { #endif #ifdef ENABLE_PROFILING - NCCLCHECK(ncclCudaCalloc(&tmpCommAndChans.comm.devProf, MAXCHANNELS*PROFILE_NUM_LAUNCHES, comm->sideStream)); + NCCLCHECK(ncclCudaCalloc(&tmpCommAndChans.comm.devProf, MAXCHANNELS*PROFILE_NUM_LAUNCHES)); #endif #ifdef ENABLE_FAULT_INJECTION diff --git a/projects/rccl/src/transport/coll_net.cc b/projects/rccl/src/transport/coll_net.cc index d1420e8996..3b2d2f7890 100644 --- a/projects/rccl/src/transport/coll_net.cc +++ b/projects/rccl/src/transport/coll_net.cc @@ -409,9 +409,9 @@ static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int if (cuda && collNet->cudaBuff == NULL) { #if defined(HIP_UNCACHED_MEMORY) - NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault)); + NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault)); #else - NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault)); + NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault)); #endif } if (!cuda && collNet->hostBuff == NULL) { @@ -504,7 +504,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) { uint64_t *cpuPtr, *gpuPtr; - NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr)); + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc)); resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; @@ -582,7 +582,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy) { uint64_t *cpuPtr, *gpuPtr; - NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr)); + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); if (ncclParamGdrCopySyncEnable()) { resources->gdcSync = cpuPtr; diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc index 23e8ebce3d..a4a06ffe12 100644 --- a/projects/rccl/src/transport/net.cc +++ b/projects/rccl/src/transport/net.cc @@ -597,14 +597,14 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int } else { #if defined(HIP_UNCACHED_MEMORY) #if defined(HIP_CONTIGUOUS_MEMORY) - NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, cuda ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault)); #else - NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault)); #endif #else - NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault)); #endif } @@ -888,14 +888,14 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } else { #if defined(HIP_UNCACHED_MEMORY) #if defined(HIP_CONTIGUOUS_MEMORY) - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault)); #else - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault)); #endif #else - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault)); #endif } @@ -914,7 +914,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) { uint64_t *cpuPtr, *gpuPtr; - NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, nullptr)); + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc)); resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; @@ -1091,14 +1091,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str } else { #if defined(HIP_UNCACHED_MEMORY) #if defined(HIP_CONTIGUOUS_MEMORY) - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault)); #else - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault)); #endif #else - NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault)); #endif } @@ -1109,7 +1109,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy && map->sameProcess) { uint64_t *cpuPtr, *gpuPtr; - NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, nullptr)); + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); if (ncclParamGdrCopySyncEnable()) { resources->gdcSync = cpuPtr; diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc index 4262fe50bf..7af56a6c95 100644 --- a/projects/rccl/src/transport/net_ib.cc +++ b/projects/rccl/src/transport/net_ib.cc @@ -1855,9 +1855,9 @@ ib_recv: if (rComm->flushEnabled) { if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) { #if defined(HIP_UNCACHED_MEMORY) - NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocUncached), ret, fail); + NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), hipDeviceMallocUncached), ret, fail); #else - NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocFinegrained), ret, fail); + NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), hipDeviceMallocFinegrained), ret, fail); #endif if (useDmaBuf) { diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc index a0b00f4765..de96530d18 100644 --- a/projects/rccl/src/transport/p2p.cc +++ b/projects/rccl/src/transport/p2p.cc @@ -247,9 +247,9 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDe } else { // Allocate a CUDA buffer and generate an IPC handle for it #if defined(HIP_UNCACHED_MEMORY) - NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocUncached)); + NCCLCHECK(ncclCudaCalloc((char **)ptr, size, hipDeviceMallocUncached)); #else - NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocFinegrained)); + NCCLCHECK(ncclCudaCalloc((char **)ptr, size, hipDeviceMallocFinegrained)); #endif cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr); if (res != cudaSuccess) { @@ -667,9 +667,9 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st connection->transportResources = proxyInfo; #if defined(HIP_UNCACHED_MEMORY) - NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocUncached)); + NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], hipDeviceMallocUncached)); #else - NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocFinegrained)); + NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], hipDeviceMallocFinegrained)); #endif // Create a SHM segment for the peer to attach to