From dfefc97178b9ecdda19cd76011697faac39dac69 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Tue, 21 Feb 2023 16:32:29 -0800 Subject: [PATCH] SWDEV-384658 - Optimize D2D memcpy - Intra device memcpy does not need to perform host side synchronization - Check alloc flags when determining memory type Change-Id: Ieff28bd8d62756ffe82905354c4a91e9717e6bd4 --- hipamd/src/hip_graph_internal.cpp | 22 +++++--- hipamd/src/hip_memory.cpp | 92 ++++++++++++++++++++++--------- 2 files changed, 80 insertions(+), 34 deletions(-) diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index 328cf4ca3b..2d1e74c728 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -110,8 +110,9 @@ hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParam // {src/dst}Array is ignored. hipMemoryType srcMemoryType = pCopy.srcMemoryType; if (srcMemoryType == hipMemoryTypeUnified) { - srcMemoryType = - getMemoryObject(pCopy.srcDevice, offset) ? hipMemoryTypeDevice : hipMemoryTypeHost; + amd::Memory* memObj = getMemoryObject(pCopy.srcDevice, offset); + srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & + memObj->getMemFlags()) ? hipMemoryTypeHost : hipMemoryTypeDevice; if (srcMemoryType == hipMemoryTypeHost) { // {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system // memory. @@ -122,9 +123,10 @@ hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParam offset = 0; hipMemoryType dstMemoryType = pCopy.dstMemoryType; if (dstMemoryType == hipMemoryTypeUnified) { - dstMemoryType = - getMemoryObject(pCopy.dstDevice, offset) ? hipMemoryTypeDevice : hipMemoryTypeHost; - if (srcMemoryType == hipMemoryTypeHost) { + amd::Memory* memObj = getMemoryObject(pCopy.dstDevice, offset); + dstMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & + memObj->getMemFlags()) ? hipMemoryTypeHost : hipMemoryTypeDevice; + if (dstMemoryType == hipMemoryTypeHost) { const_cast(&pCopy)->dstHost = pCopy.dstDevice; const_cast(&pCopy)->dstXInBytes += offset; } @@ -133,8 +135,9 @@ hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParam // If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned. // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning. if (srcMemoryType == hipMemoryTypeHost) { - amd::Memory* mem = getMemoryObject(pCopy.srcHost, offset); - srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + srcMemoryType = getMemoryObject(pCopy.srcHost, offset) ? hipMemoryTypeDevice : + hipMemoryTypeHost; + if (srcMemoryType == hipMemoryTypeDevice) { const_cast(&pCopy)->srcDevice = const_cast(pCopy.srcHost); const_cast(&pCopy)->srcXInBytes += offset; @@ -142,8 +145,9 @@ hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParam } offset = 0; if (dstMemoryType == hipMemoryTypeHost) { - amd::Memory* mem = getMemoryObject(pCopy.dstHost, offset); - dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + dstMemoryType = getMemoryObject(pCopy.dstHost, offset) ? hipMemoryTypeDevice : + hipMemoryTypeHost; + if (dstMemoryType == hipMemoryTypeDevice) { const_cast(&pCopy)->dstDevice = const_cast(pCopy.dstDevice); const_cast(&pCopy)->dstXInBytes += offset; diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index fc29d0c1ec..8b92754ae3 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -471,10 +471,18 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin if (srcMemory == nullptr && dstMemory == nullptr) { ihipHtoHMemcpy(dst, src, sizeBytes, stream); return hipSuccess; - } else if ((srcMemory == nullptr) && (dstMemory != nullptr)) { - isHostAsync = false; - } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { + } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) || + ((srcMemory != nullptr) && (dstMemory == nullptr))) { isHostAsync = false; + } else { + hipMemoryType srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & + srcMemory->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice; + hipMemoryType dstMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & + dstMemory->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice; + // Device to Device copies do not need to host side synchronization. + if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice)) { + isHostAsync = true; + } } amd::Command* command = nullptr; @@ -2043,12 +2051,17 @@ hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3 hip::Stream* stream) { size_t offset = 0; // If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the - // (unified virtual address space) base address of the source data and the bytes per row to apply. - // {src/dst}Array is ignored. + // (unified virtual address space) base address of the source data and the bytes per row to + // apply. {src/dst}Array is ignored. hipMemoryType srcMemoryType = pCopy->srcMemoryType; if (srcMemoryType == hipMemoryTypeUnified) { - srcMemoryType = - getMemoryObject(pCopy->srcDevice, offset) ? hipMemoryTypeDevice : hipMemoryTypeHost; + amd::Memory* memObj = getMemoryObject(pCopy->srcDevice, offset); + if (memObj != nullptr) { + srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & + memObj->getMemFlags()) ? hipMemoryTypeHost : hipMemoryTypeDevice; + } else { + srcMemoryType = hipMemoryTypeHost; + } if (srcMemoryType == hipMemoryTypeHost) { // {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system // memory. @@ -2059,9 +2072,15 @@ hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3 offset = 0; hipMemoryType dstMemoryType = pCopy->dstMemoryType; if (dstMemoryType == hipMemoryTypeUnified) { - dstMemoryType = - getMemoryObject(pCopy->dstDevice, offset) ? hipMemoryTypeDevice : hipMemoryTypeHost; - if (srcMemoryType == hipMemoryTypeHost) { + amd::Memory* memObj = getMemoryObject(pCopy->dstDevice, offset); + if (memObj != nullptr) { + dstMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & + memObj->getMemFlags()) ? hipMemoryTypeHost : hipMemoryTypeDevice; + } else { + dstMemoryType = hipMemoryTypeHost; + } + + if (dstMemoryType == hipMemoryTypeHost) { const_cast(pCopy)->dstHost = pCopy->dstDevice; const_cast(pCopy)->dstXInBytes += offset; } @@ -2071,8 +2090,9 @@ hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3 // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning. offset = 0; if (srcMemoryType == hipMemoryTypeHost) { - amd::Memory* mem = getMemoryObject(pCopy->srcHost, offset); - srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + srcMemoryType = getMemoryObject(pCopy->srcHost, offset) ? hipMemoryTypeDevice : + hipMemoryTypeHost; + if (srcMemoryType == hipMemoryTypeDevice) { const_cast(pCopy)->srcDevice = const_cast(pCopy->srcHost); const_cast(pCopy)->srcXInBytes += offset; @@ -2080,8 +2100,9 @@ hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3 } offset = 0; if (dstMemoryType == hipMemoryTypeHost) { - amd::Memory* mem = getMemoryObject(pCopy->dstHost, offset); - dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + dstMemoryType = getMemoryObject(pCopy->dstHost, offset) ? hipMemoryTypeDevice : + hipMemoryTypeHost; + if (dstMemoryType == hipMemoryTypeDevice) { const_cast(pCopy)->dstDevice = const_cast(pCopy->dstHost); const_cast(pCopy)->dstXInBytes += offset; @@ -2171,7 +2192,14 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool // base address of the source data and the bytes per row to apply. {src/dst}Array is ignored. hipMemoryType srcMemoryType = pCopy->srcMemoryType; if (srcMemoryType == hipMemoryTypeUnified) { - srcMemoryType = getMemoryObject(pCopy->srcDevice, offset) ? hipMemoryTypeDevice : hipMemoryTypeHost; + amd::Memory* memObj = getMemoryObject(pCopy->srcDevice, offset); + if (memObj != nullptr) { + srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & + memObj->getMemFlags()) ? hipMemoryTypeHost : hipMemoryTypeDevice; + } else { + srcMemoryType = hipMemoryTypeHost; + } + if (srcMemoryType == hipMemoryTypeHost) { // {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system memory. const_cast(pCopy)->srcHost = pCopy->srcDevice; @@ -2181,8 +2209,15 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool offset = 0; hipMemoryType dstMemoryType = pCopy->dstMemoryType; if (dstMemoryType == hipMemoryTypeUnified) { - dstMemoryType = getMemoryObject(pCopy->dstDevice, offset) ? hipMemoryTypeDevice : hipMemoryTypeHost; - if (srcMemoryType == hipMemoryTypeHost) { + amd::Memory* memObj = getMemoryObject(pCopy->dstDevice, offset); + if (memObj != nullptr) { + dstMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & + memObj->getMemFlags()) ? hipMemoryTypeHost : hipMemoryTypeDevice; + } else { + dstMemoryType = hipMemoryTypeHost; + } + + if (dstMemoryType == hipMemoryTypeHost) { const_cast(pCopy)->dstHost = pCopy->dstDevice; const_cast(pCopy)->dstXInBytes += offset; } @@ -2191,13 +2226,14 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning. offset = 0; if (srcMemoryType == hipMemoryTypeHost) { - amd::Memory* mem = getMemoryObject(pCopy->srcHost, offset); - srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + srcMemoryType = getMemoryObject(pCopy->srcHost, offset) ? hipMemoryTypeDevice : + hipMemoryTypeHost; } if (dstMemoryType == hipMemoryTypeHost) { - amd::Memory* mem = getMemoryObject(pCopy->dstHost, offset); - dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + dstMemoryType = getMemoryObject(pCopy->dstHost, offset) ? hipMemoryTypeDevice : + hipMemoryTypeHost; } + if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) { amd::Coord3D srcOrigin = {pCopy->srcXInBytes, pCopy->srcY, pCopy->srcZ}; amd::Coord3D dstOrigin = {pCopy->dstXInBytes, pCopy->dstY, pCopy->dstZ}; @@ -2217,11 +2253,17 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool status = ihipGetMemcpyParam3DCommand(command, pCopy, hip_stream); if (status != hipSuccess) return status; - // Transfers from device memory to pageable host memory and transfers from any host memory to any host memory - // are synchronous with respect to the host. + // Transfers from device memory to pageable host memory and transfers from any + // host memory to any host memory are synchronous with respect to the host. + // Device to Device copies do not need to host side synchronization. if (dstMemoryType == hipMemoryTypeHost || - ((pCopy->srcMemoryType == hipMemoryTypeHost) && (pCopy->dstMemoryType == hipMemoryTypeHost))) { - isAsync = false; + ((pCopy->srcMemoryType == hipMemoryTypeHost) && + (pCopy->dstMemoryType == hipMemoryTypeHost))) { + isAsync = false; + } else if ((pCopy->srcMemoryType == hipMemoryTypeDevice) && + (pCopy->dstMemoryType == hipMemoryTypeDevice)) { + // Device to Device copies dont need to wait for host synchronization + isAsync = true; } return ihipMemcpyCmdEnqueue(command, isAsync); }