From 5e158396115aa38bacda4ccca2627a82c65f894e Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Mon, 12 Jan 2026 21:09:16 +0400 Subject: [PATCH] Revert "SWDEV-566854 - Improve memory object handling (#1939)" (#2572) This reverts commit 39d84328932de5b9fbc26f958c0467d479072831. rocprim failures were introduced with the commit. Based on the @erman-gurses investigation: Based on the list here: 2789ea4...050e88e https://github.com/ROCm/TheRock/actions/runs/20864279671 -> e005f84 (FAILED) https://github.com/ROCm/TheRock/actions/runs/20867580342 -> 39d8432 (FAILED) https://github.com/ROCm/TheRock/actions/runs/20870979894 -> 88f4bb1 (PASSED) https://github.com/ROCm/TheRock/actions/runs/20872795557 -> 11d9472 (PASSED) So the issue comes from this commit SWDEV-566854 - Improve memory object handling (#1939) SHA: 39d8432 --- projects/clr/hipamd/src/hip_graph_helper.hpp | 30 +- .../clr/hipamd/src/hip_graph_internal.cpp | 18 +- .../clr/hipamd/src/hip_graph_internal.hpp | 76 +--- projects/clr/hipamd/src/hip_memory.cpp | 387 ++++++++---------- 4 files changed, 185 insertions(+), 326 deletions(-) diff --git a/projects/clr/hipamd/src/hip_graph_helper.hpp b/projects/clr/hipamd/src/hip_graph_helper.hpp index 7f1d70dd80..f157f4c8c9 100644 --- a/projects/clr/hipamd/src/hip_graph_helper.hpp +++ b/projects/clr/hipamd/src/hip_graph_helper.hpp @@ -34,23 +34,8 @@ hipError_t hipMemcpy2DValidateBuffer(const void* buf, size_t pitch, size_t width hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); -hipError_t ihipMemcpy_validate_memory(amd::Memory* memObj, size_t sizeBytes, size_t offset, - bool read_write); - -hipError_t ihipMemcpy_validate(amd::Memory* dstMemory, amd::Memory* srcMemory, size_t sizeBytes, - size_t dstOffset, size_t srcOffset); - -hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, const void* srcMemory, - size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, - size_t dstOffset, bool isAsync = true); - -hipError_t ihipMemcpyCommand(amd::Command*& command, void* dstMemory, amd::Memory* srcMemory, - size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, - size_t srcOffset, bool isAsync = true); - -hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, amd::Memory* srcMemory, - size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, - size_t dstOffset, size_t srcOffset, bool isAsync = true); +hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind kind, hip::Stream& stream, bool isAsync = true); void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& stream); @@ -138,14 +123,5 @@ hipError_t ihipMemcpyAtoHValidate(hipArray_t srcArray, void* dstHost, amd::Coord hipError_t ihipGraphMemsetParams_validate(const hipMemsetParams* pNodeParams); -constexpr hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst) { - return hipHostToHost; -} -constexpr hip::MemcpyType ihipGetMemcpyType(const void* src, amd::Memory* dst) { - return hipWriteBuffer; -} -constexpr hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, void* dst) { - return hipReadBuffer; -} -hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, amd::Memory* dst, hipMemcpyKind kind); +hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst, hipMemcpyKind kind); } // namespace hip diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 17fd391db0..59a62a499f 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -77,8 +77,9 @@ amd::Monitor GraphNode::WorkerThreadLock_{}; hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t count, hipMemcpyKind kind) { - if (dst == nullptr || src == nullptr) { - return hipErrorInvalidValue; + hipError_t status = ihipMemcpy_validate(dst, src, count, kind); + if (status != hipSuccess) { + return status; } size_t sOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, sOffset); @@ -95,19 +96,6 @@ hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t } } - if (srcMemory != nullptr) { - hipError_t status = ihipMemcpy_validate_memory(srcMemory, count, sOffset, /*read_write*/ false); - if (status != hipSuccess) { - return status; - } - } - if (dstMemory != nullptr) { - hipError_t status = ihipMemcpy_validate_memory(dstMemory, count, dOffset, /*read_write*/ true); - if (status != hipSuccess) { - return status; - } - } - return hipSuccess; } diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index 64a13c0012..5b42917561 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -1856,16 +1856,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { amd::Memory* srcMemory = getMemoryObject(src_, sOffset); size_t dOffset = 0; amd::Memory* dstMemory = getMemoryObject(dst_, dOffset); - - hip::MemcpyType memType = hipHostToHost; - if (srcMemory != nullptr && dstMemory == nullptr) { - memType = ihipGetMemcpyType(srcMemory, dst_); - } else if (srcMemory == nullptr && dstMemory != nullptr) { - memType = ihipGetMemcpyType(src_, dstMemory); - } else if (srcMemory != nullptr && dstMemory != nullptr) { - memType = ihipGetMemcpyType(srcMemory, dstMemory, kind_); - } - + hip::MemcpyType memType = ihipGetMemcpyType(src_, dst_, kind_); switch (memType) { case hipCopyBuffer: // D2H/H2D source/dst is pinned memory @@ -1923,24 +1914,8 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { if (!AMD_DIRECT_DISPATCH) { WorkerThreadLock_.lock(); } - - hip::MemcpyType type; - size_t dOffset, sOffset; - amd::Memory* dstMemory = getMemoryObject(dst_, dOffset); - amd::Memory* srcMemory = getMemoryObject(src_, sOffset); - - if (dstMemory != nullptr && srcMemory != nullptr) { - status = ihipMemcpyCommand(command, dstMemory, srcMemory, count_, kind_, *stream, dOffset, - sOffset); - type = ihipGetMemcpyType(srcMemory, dstMemory, kind_); - } else if (dstMemory == nullptr && srcMemory != nullptr) { - status = ihipMemcpyCommand(command, dst_, srcMemory, count_, kind_, *stream, sOffset); - type = ihipGetMemcpyType(srcMemory, dst_); - } else if (dstMemory != nullptr && srcMemory == nullptr) { - status = ihipMemcpyCommand(command, dstMemory, src_, count_, kind_, *stream, dOffset); - type = ihipGetMemcpyType(src_, dstMemory); - } - + status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *stream); + hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_); if (type == hipCopyBuffer) { amd::CopyMemoryCommand* cpycmd = reinterpret_cast(command); amd::CopyMetadata copyMetadata = cpycmd->copyMetadata(); @@ -2089,18 +2064,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { } virtual bool GraphCaptureEnabled() override { if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) { - hip::MemcpyType type; - - size_t dOffset, sOffset; - amd::Memory* dstMemory = getMemoryObject(dst_, dOffset); - amd::Memory* srcMemory = getMemoryObject(src_, sOffset); - - // The case below is only interested in hipCopyBuffer, - // which is only valid for device to device copies. - if (dstMemory != nullptr && srcMemory != nullptr) { - type = ihipGetMemcpyType(srcMemory, dstMemory, kind_); - } - + hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_); switch (type) { case hipCopyBuffer: return true; @@ -2147,21 +2111,7 @@ class GraphMemcpyNodeFromSymbol : public GraphMemcpyNode1D { if (status != hipSuccess) { return status; } - - size_t devOffset, dOffset; - amd::Memory* devMemory = getMemoryObject(device_ptr, devOffset); - amd::Memory* dstMemory = getMemoryObject(dst_, dOffset); - - if (devMemory == nullptr) { - return hipErrorInvalidValue; - } - - if (dstMemory != nullptr) { - status = ihipMemcpyCommand(command, dstMemory, devMemory, count_, kind_, *stream, dOffset, devOffset); - } else { - status = ihipMemcpyCommand(command, dst_, devMemory, count_, kind_, *stream, devOffset); - } - + status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *stream); if (status != hipSuccess) { return status; } @@ -2256,21 +2206,7 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D { if (status != hipSuccess) { return status; } - - size_t devOffset, sOffset; - amd::Memory* devMemory = getMemoryObject(device_ptr, devOffset); - amd::Memory* srcMemory = getMemoryObject(src_, sOffset); - - if (devMemory == nullptr) { - return hipErrorInvalidValue; - } - - if (srcMemory != nullptr) { - status = ihipMemcpyCommand(command, devMemory, srcMemory, count_, kind_, *stream, devOffset, sOffset); - } else { - status = ihipMemcpyCommand(command, devMemory, src_, count_, kind_, *stream, devOffset); - } - + status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *stream); if (status != hipSuccess) { return status; } diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 241067288a..f802c4b68c 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -174,7 +174,8 @@ hipError_t hipExternalMemoryGetMappedBuffer(void** devPtr, hipExternalMemory_t e auto buf = reinterpret_cast(extMem); // Validate bounds - if (bufferDesc->size > buf->getSize() || bufferDesc->offset > buf->getSize() - bufferDesc->size) { + if (bufferDesc->size > buf->getSize() || + bufferDesc->offset > buf->getSize() - bufferDesc->size) { HIP_RETURN(hipErrorInvalidValue); } @@ -460,194 +461,188 @@ bool IsHtoHMemcpyValid(void* dst, const void* src, hipMemcpyKind kind) { } // ================================================================================================ -hipError_t ihipMemcpy_validate_memory(amd::Memory* memObj, size_t sizeBytes, size_t offset, - bool read_write) { - // Validate Mem Access in case of VMM Memory - if (!memObj->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], read_write)) { - return hipErrorUnknown; +hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { + if (dst == nullptr || src == nullptr) { + return hipErrorInvalidValue; + } + if (static_cast(kind) > hipMemcpyDefault && kind != hipMemcpyDeviceToDeviceNoCU) { + return hipErrorInvalidMemcpyDirection; + } + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(src, sOffset); + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dst, dOffset); + + if (srcMemory != nullptr) { + // Validate Mem Access in case of VMM Memory + if (!srcMemory->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], false)) { + return hipErrorUnknown; + } + + // If the mem object is a VMM sub buffer (subbuffer has parent set), + // then use parent's size for validation. + if (srcMemory->parent() && (srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + srcMemory = srcMemory->parent(); + } + + // Size validation + if (sizeBytes > (srcMemory->getSize() - sOffset)) { + return hipErrorInvalidValue; + } } - // If the mem object is a VMM sub buffer (subbuffer has parent set), - // then use parent's size for validation. - if (memObj->parent() && (memObj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - memObj = memObj->parent(); + if (dstMemory != nullptr) { + // Validate Mem Access in case of VMM Memory + if (!dstMemory->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], true)) { + return hipErrorUnknown; + } + + // If the mem object is a VMM sub buffer (subbuffer has parent set), + // then use parent's size for validation. + if (dstMemory->parent() && (dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + dstMemory = dstMemory->parent(); + } + + // Size validation + if (sizeBytes > (dstMemory->getSize() - dOffset)) { + return hipErrorInvalidValue; + } } - // Size validation - if (sizeBytes > (memObj->getSize() - offset)) { + // If src and dst ptr are null then kind must be either h2h or def. + if (!IsHtoHMemcpyValid(dst, src, kind)) { return hipErrorInvalidValue; } return hipSuccess; } -// ================================================================================================ -hipError_t ihipMemcpy_validate(amd::Memory* dstMemory, amd::Memory* srcMemory, size_t sizeBytes, - size_t dstOffset, size_t srcOffset) { - hipError_t status; - - status = ihipMemcpy_validate_memory(srcMemory, sizeBytes, srcOffset, /*read_write*/ false); - if (status != hipSuccess) return status; - status = ihipMemcpy_validate_memory(dstMemory, sizeBytes, dstOffset, /*read_write*/ true); - if (status != hipSuccess) return status; - - return hipSuccess; -} - -// ================================================================================================ -hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, amd::Memory* dst, hipMemcpyKind kind) { - if ((src->GetDeviceById() != dst->GetDeviceById()) && - ((src->getContext().devices().size() == 1) && (dst->getContext().devices().size() == 1))) { - return hipCopyBufferP2P; - } else if (kind == hipMemcpyDeviceToDeviceNoCU) { - return hipCopyBufferSDMA; - } - return hipCopyBuffer; -} - -// ================================================================================================ -// Helper class to manage common memcpy command state and cleanup -class MemcpyCommandHelper { - public: - MemcpyCommandHelper(hip::Stream& stream, bool isAsync) - : waitList_(), - copyMetadata_(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE), - pStream_(&stream), - queueDevice_(&stream.device()) {} - - ~MemcpyCommandHelper() { - // Cleanup: release waitList command if present - if (waitList_.size() > 0) { - waitList_[0]->release(); +hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst, hipMemcpyKind kind) { + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(src, sOffset); + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dst, dOffset); + hip::MemcpyType type; + if (srcMemory == nullptr && dstMemory == nullptr) { + type = hipHostToHost; + } else if ((srcMemory == nullptr) && (dstMemory != nullptr)) { + type = hipWriteBuffer; + } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { + type = hipReadBuffer; + } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { + // Check if the queue device doesn't match the device on any memory object. + // And any of them are not host allocation. + // Hence it's a P2P transfer, because the app has requested access to another GPU + if ((srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) && + ((srcMemory->getContext().devices().size() == 1) && + (dstMemory->getContext().devices().size() == 1))) { + type = hipCopyBufferP2P; + } else if (kind == hipMemcpyDeviceToDeviceNoCU) { + type = hipCopyBufferSDMA; + } else { + type = hipCopyBuffer; } } - - // Non-copyable, non-movable - MemcpyCommandHelper(const MemcpyCommandHelper&) = delete; - MemcpyCommandHelper& operator=(const MemcpyCommandHelper&) = delete; - - amd::Command::EventWaitList& waitList() { return waitList_; } - amd::CopyMetadata& copyMetadata() { return copyMetadata_; } - hip::Stream*& pStream() { return pStream_; } - amd::Device* queueDevice() const { return queueDevice_; } - - // Helper to add wait command from stream - void addWaitCommand(hip::Stream& stream) { - amd::Command* cmd = stream.getLastQueuedCommand(true); - if (cmd != nullptr) { - waitList_.push_back(cmd); - } - } - - // Helper to switch stream and add wait command - void switchStreamAndWait(hip::Stream& originalStream, amd::Context& context) { - pStream_ = hip::getNullStream(context); - addWaitCommand(originalStream); - } - - // Common error handling for command creation - static hipError_t checkCommand(amd::Command* command) { - if (command == nullptr) { - return hipErrorOutOfMemory; - } - return hipSuccess; - } - - private: - amd::Command::EventWaitList waitList_; - amd::CopyMetadata copyMetadata_; - hip::Stream* pStream_; - amd::Device* queueDevice_; -}; - -// ================================================================================================ -hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, const void* srcMemory, - size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, - size_t dstOffset, bool isAsync) { - MemcpyCommandHelper helper(stream, isAsync); - - if (&stream.device() != dstMemory->GetDeviceById() && - !(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - helper.switchStreamAndWait(stream, dstMemory->GetDeviceById()->context()); - } - command = new amd::WriteMemoryCommand( - *helper.pStream(), CL_COMMAND_WRITE_BUFFER, helper.waitList(), *dstMemory->asBuffer(), - dstOffset, sizeBytes, srcMemory, 0, 0, helper.copyMetadata()); - return MemcpyCommandHelper::checkCommand(command); + return type; } // ================================================================================================ -hipError_t ihipMemcpyCommand(amd::Command*& command, void* dstMemory, amd::Memory* srcMemory, - size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, - size_t srcOffset, bool isAsync) { - MemcpyCommandHelper helper(stream, isAsync); - - if (helper.queueDevice() != srcMemory->GetDeviceById() && - !(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context()); - } - command = new amd::ReadMemoryCommand(*helper.pStream(), CL_COMMAND_READ_BUFFER, helper.waitList(), - *srcMemory->asBuffer(), srcOffset, sizeBytes, - dstMemory, 0, 0, helper.copyMetadata()); - return MemcpyCommandHelper::checkCommand(command); -} - -// ================================================================================================ -hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, amd::Memory* srcMemory, - size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, - size_t dstOffset, size_t srcOffset, bool isAsync) { - MemcpyCommandHelper helper(stream, isAsync); - - hip::MemcpyType type = ihipGetMemcpyType(srcMemory, dstMemory, kind); +hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind kind, hip::Stream& stream, bool isAsync) { + amd::Command::EventWaitList waitList; + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(src, sOffset); + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dst, dOffset); + amd::Device* queueDevice = &stream.device(); + amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE); + hip::MemcpyType type = ihipGetMemcpyType(src, dst, kind); + hip::Stream* pStream = &stream; switch (type) { + case hipWriteBuffer: + if (queueDevice != dstMemory->GetDeviceById() && + !(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); + amd::Command* cmd = stream.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } + command = new amd::WriteMemoryCommand(*pStream, CL_COMMAND_WRITE_BUFFER, waitList, + *dstMemory->asBuffer(), dOffset, sizeBytes, src, 0, 0, + copyMetadata); + break; + case hipReadBuffer: + if (queueDevice != srcMemory->GetDeviceById() && + !(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); + amd::Command* cmd = stream.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } + command = new amd::ReadMemoryCommand(*pStream, CL_COMMAND_READ_BUFFER, waitList, + *srcMemory->asBuffer(), sOffset, sizeBytes, dst, 0, 0, + copyMetadata); + break; case hipCopyBufferP2P: - command = new amd::CopyMemoryP2PCommand( - stream, CL_COMMAND_COPY_BUFFER, helper.waitList(), *srcMemory->asBuffer(), - *dstMemory->asBuffer(), srcOffset, dstOffset, sizeBytes); - { - hipError_t status = MemcpyCommandHelper::checkCommand(command); - if (status != hipSuccess) { - return status; - } - // Make sure runtime has valid memory for the command execution. P2P access - // requires page table mapping on the current device to another GPU memory - if (!static_cast(command)->validateMemory()) { - delete command; - return hipErrorInvalidValue; - } + command = new amd::CopyMemoryP2PCommand(stream, CL_COMMAND_COPY_BUFFER, waitList, + *srcMemory->asBuffer(), *dstMemory->asBuffer(), + sOffset, dOffset, sizeBytes); + if (command == nullptr) { + return hipErrorOutOfMemory; + } + // Make sure runtime has valid memory for the command execution. P2P access + // requires page table mapping on the current device to another GPU memory + if (!static_cast(command)->validateMemory()) { + delete command; + return hipErrorInvalidValue; } break; case hipCopyBufferSDMA: - helper.copyMetadata().copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::SDMA; + copyMetadata.copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::SDMA; case hipCopyBuffer: if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) && - helper.queueDevice() != srcMemory->GetDeviceById()) { - helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context()); + queueDevice != srcMemory->GetDeviceById()) { + pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); + amd::Command* cmd = stream.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } } else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) { // Scenarios such as DtoH where dst is pinned memory - if ((helper.queueDevice() != srcMemory->GetDeviceById()) && + if ((queueDevice != srcMemory->GetDeviceById()) && (dstMemory->getContext().devices().size() != 1) && !(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context()); + pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); + amd::Command* cmd = stream.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } // Scenarios such as HtoD where src is pinned memory - } else if ((helper.queueDevice() != dstMemory->GetDeviceById()) && + } else if ((queueDevice != dstMemory->GetDeviceById()) && (srcMemory->getContext().devices().size() != 1) && !(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - helper.switchStreamAndWait(stream, dstMemory->GetDeviceById()->context()); + pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); + amd::Command* cmd = stream.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } } } - command = new amd::CopyMemoryCommand( - *helper.pStream(), CL_COMMAND_COPY_BUFFER, helper.waitList(), *srcMemory->asBuffer(), - *dstMemory->asBuffer(), srcOffset, dstOffset, sizeBytes, - helper.copyMetadata()); - break; - case hipHostToHost: - assert(false && "Unreachable case"); + command = new amd::CopyMemoryCommand(*pStream, CL_COMMAND_COPY_BUFFER, waitList, + *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, + dOffset, sizeBytes, copyMetadata); break; } - return MemcpyCommandHelper::checkCommand(command); + if (command == nullptr) { + return hipErrorOutOfMemory; + } + if (waitList.size() > 0) { + waitList[0]->release(); + } + return hipSuccess; } -// ================================================================================================ bool IsHtoHMemcpy(void* dst, const void* src) { size_t sOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, sOffset); @@ -673,79 +668,43 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin // Skip if nothing needs writing. return hipSuccess; } - if (dst == nullptr || src == nullptr) { - return hipErrorInvalidValue; - } - if (static_cast(kind) > hipMemcpyDefault && kind != hipMemcpyDeviceToDeviceNoCU) { - return hipErrorInvalidMemcpyDirection; + status = ihipMemcpy_validate(dst, src, sizeBytes, kind); + if (status != hipSuccess) { + return status; } if (src == dst && kind == hipMemcpyDefault) { return hipSuccess; } - size_t sOffset = 0; - amd::Memory* srcDeviceMemory = getMemoryObject(src, sOffset); + amd::Memory* srcMemory = getMemoryObject(src, sOffset); size_t dOffset = 0; - amd::Memory* dstDeviceMemory = getMemoryObject(dst, dOffset); - - // Handle kind vs memobject miss matches - if (kind == hipMemcpyDeviceToHost && srcDeviceMemory == nullptr) { - return hipErrorInvalidValue; - } - if (kind == hipMemcpyHostToDevice && dstDeviceMemory == nullptr) { - return hipErrorInvalidValue; - } + amd::Memory* dstMemory = getMemoryObject(dst, dOffset); - amd::Command* command = nullptr; - if (srcDeviceMemory == nullptr && dstDeviceMemory == nullptr) { + hipMemoryType srcMemoryType = getMemoryType(srcMemory); + hipMemoryType dstMemoryType = getMemoryType(dstMemory); + + if (srcMemory == nullptr && dstMemory == nullptr) { ihipHtoHMemcpy(dst, src, sizeBytes, stream); return hipSuccess; - } else if (dstDeviceMemory == nullptr || srcDeviceMemory == nullptr) { + } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) || + ((srcMemory != nullptr) && (dstMemory == nullptr))) { // Unpinned copy wait behavior is enforced in the lower copy layers so skip // wait at top level except for MT path isHostAsync &= AMD_DIRECT_DISPATCH ? true : false; - if (dstDeviceMemory != nullptr) { - status = ihipMemcpy_validate_memory(dstDeviceMemory, sizeBytes, dOffset, /*read_write*/ true); - if (status != hipSuccess) { - return status; - } - status = - ihipMemcpyCommand(command, dstDeviceMemory, src, sizeBytes, kind, stream, dOffset, isHostAsync); - } else { - status = - ihipMemcpy_validate_memory(srcDeviceMemory, sizeBytes, sOffset, /*read_write*/ false); - if (status != hipSuccess) { - return status; - } - status = - ihipMemcpyCommand(command, dst, srcDeviceMemory, sizeBytes, kind, stream, sOffset, isHostAsync); + } else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) { + // Device to Device copies do not need to host side synchronization. + if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) && + (!srcMemory->getUserData().sync_mem_ops_ || !dstMemory->getUserData().sync_mem_ops_)) { + isHostAsync = true; } - } else { - // Both are AMD memory - hipMemoryType srcMemoryType = getMemoryType(srcDeviceMemory); - hipMemoryType dstMemoryType = getMemoryType(dstDeviceMemory); - - status = ihipMemcpy_validate(dstDeviceMemory, srcDeviceMemory, sizeBytes, dOffset, sOffset); - if (status != hipSuccess) { - return status; + // Any Host to any Host need host side synchronization. + if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) { + isHostAsync = false; } - - if (srcDeviceMemory->GetDeviceById() == dstDeviceMemory->GetDeviceById()) { - // Device to Device copies do not need to host side synchronization. - if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) && - (!srcDeviceMemory->getUserData().sync_mem_ops_ || - !dstDeviceMemory->getUserData().sync_mem_ops_)) { - isHostAsync = true; - } - // Any Host to any Host need host side synchronization. - if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) { - isHostAsync = false; - } - } - status = ihipMemcpyCommand(command, dstDeviceMemory, srcDeviceMemory, sizeBytes, kind, stream, - dOffset, sOffset, isHostAsync); } + amd::Command* command = nullptr; + status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, stream, isHostAsync); if (status != hipSuccess) { return status; } @@ -753,7 +712,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin if (!isHostAsync) { command->queue()->finishCommand(command); } else if (!isGPUAsync) { - hip::Stream* pStream = hip::getNullStream(dstDeviceMemory->GetDeviceById()->context()); + hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); amd::Command::EventWaitList waitList; waitList.push_back(command); amd::Command* depdentMarker = new amd::Marker(*pStream, false, waitList); @@ -3725,7 +3684,7 @@ hipError_t ihipPointerGetAttributes(void* data, hipPointer_attribute attribute, case HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE: { if (memObj) { if (getMemoryType(memObj) == hipMemoryTypeHost) { - // host pointer, pinned or registered memory + // host pointer, pinned or registered memory *reinterpret_cast(data) = 0; } else if ((memObj->getMemFlags() & kManagedAlloc) == kManagedAlloc) { // managed allocation