diff --git a/projects/clr/hipamd/src/hip_graph_helper.hpp b/projects/clr/hipamd/src/hip_graph_helper.hpp index f157f4c8c9..7f1d70dd80 100644 --- a/projects/clr/hipamd/src/hip_graph_helper.hpp +++ b/projects/clr/hipamd/src/hip_graph_helper.hpp @@ -34,8 +34,23 @@ hipError_t hipMemcpy2DValidateBuffer(const void* buf, size_t pitch, size_t width hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); -hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind, hip::Stream& stream, bool isAsync = true); +hipError_t ihipMemcpy_validate_memory(amd::Memory* memObj, size_t sizeBytes, size_t offset, + bool read_write); + +hipError_t ihipMemcpy_validate(amd::Memory* dstMemory, amd::Memory* srcMemory, size_t sizeBytes, + size_t dstOffset, size_t srcOffset); + +hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, const void* srcMemory, + size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, + size_t dstOffset, bool isAsync = true); + +hipError_t ihipMemcpyCommand(amd::Command*& command, void* dstMemory, amd::Memory* srcMemory, + size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, + size_t srcOffset, bool isAsync = true); + +hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, amd::Memory* srcMemory, + size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, + size_t dstOffset, size_t srcOffset, bool isAsync = true); void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& stream); @@ -123,5 +138,14 @@ hipError_t ihipMemcpyAtoHValidate(hipArray_t srcArray, void* dstHost, amd::Coord hipError_t ihipGraphMemsetParams_validate(const hipMemsetParams* pNodeParams); -hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst, hipMemcpyKind kind); +constexpr hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst) { + return hipHostToHost; +} +constexpr hip::MemcpyType ihipGetMemcpyType(const void* src, amd::Memory* dst) { + return hipWriteBuffer; +} +constexpr hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, void* dst) { + return hipReadBuffer; +} +hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, amd::Memory* dst, hipMemcpyKind kind); } // namespace hip diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 253036753f..09ce75ff36 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -77,9 +77,8 @@ amd::Monitor GraphNode::WorkerThreadLock_{}; hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t count, hipMemcpyKind kind) { - hipError_t status = ihipMemcpy_validate(dst, src, count, kind); - if (status != hipSuccess) { - return status; + if (dst == nullptr || src == nullptr) { + return hipErrorInvalidValue; } size_t sOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, sOffset); @@ -96,6 +95,19 @@ hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t } } + if (srcMemory != nullptr) { + hipError_t status = ihipMemcpy_validate_memory(srcMemory, count, sOffset, /*read_write*/ false); + if (status != hipSuccess) { + return status; + } + } + if (dstMemory != nullptr) { + hipError_t status = ihipMemcpy_validate_memory(dstMemory, count, dOffset, /*read_write*/ true); + if (status != hipSuccess) { + return status; + } + } + return hipSuccess; } diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index ca3966ba66..fb630cfeed 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -1797,7 +1797,16 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { amd::Memory* srcMemory = getMemoryObject(src_, sOffset); size_t dOffset = 0; amd::Memory* dstMemory = getMemoryObject(dst_, dOffset); - hip::MemcpyType memType = ihipGetMemcpyType(src_, dst_, kind_); + + hip::MemcpyType memType = hipHostToHost; + if (srcMemory != nullptr && dstMemory == nullptr) { + memType = ihipGetMemcpyType(srcMemory, dst_); + } else if (srcMemory == nullptr && dstMemory != nullptr) { + memType = ihipGetMemcpyType(src_, dstMemory); + } else if (srcMemory != nullptr && dstMemory != nullptr) { + memType = ihipGetMemcpyType(srcMemory, dstMemory, kind_); + } + switch (memType) { case hipCopyBuffer: // D2H/H2D source/dst is pinned memory @@ -1855,8 +1864,24 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { if (!AMD_DIRECT_DISPATCH) { WorkerThreadLock_.lock(); } - status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *stream); - hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_); + + hip::MemcpyType type; + size_t dOffset, sOffset; + amd::Memory* dstMemory = getMemoryObject(dst_, dOffset); + amd::Memory* srcMemory = getMemoryObject(src_, sOffset); + + if (dstMemory != nullptr && srcMemory != nullptr) { + status = ihipMemcpyCommand(command, dstMemory, srcMemory, count_, kind_, *stream, dOffset, + sOffset); + type = ihipGetMemcpyType(srcMemory, dstMemory, kind_); + } else if (dstMemory == nullptr && srcMemory != nullptr) { + status = ihipMemcpyCommand(command, dst_, srcMemory, count_, kind_, *stream, sOffset); + type = ihipGetMemcpyType(srcMemory, dst_); + } else if (dstMemory != nullptr && srcMemory == nullptr) { + status = ihipMemcpyCommand(command, dstMemory, src_, count_, kind_, *stream, dOffset); + type = ihipGetMemcpyType(src_, dstMemory); + } + if (type == hipCopyBuffer) { amd::CopyMemoryCommand* cpycmd = reinterpret_cast(command); amd::CopyMetadata copyMetadata = cpycmd->copyMetadata(); @@ -2005,7 +2030,18 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { } virtual bool GraphCaptureEnabled() override { if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) { - hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_); + hip::MemcpyType type; + + size_t dOffset, sOffset; + amd::Memory* dstMemory = getMemoryObject(dst_, dOffset); + amd::Memory* srcMemory = getMemoryObject(src_, sOffset); + + // The case below is only interested in hipCopyBuffer, + // which is only valid for device to device copies. + if (dstMemory != nullptr && srcMemory != nullptr) { + type = ihipGetMemcpyType(srcMemory, dstMemory, kind_); + } + switch (type) { case hipCopyBuffer: return true; @@ -2052,7 +2088,21 @@ class GraphMemcpyNodeFromSymbol : public GraphMemcpyNode1D { if (status != hipSuccess) { return status; } - status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *stream); + + size_t devOffset, dOffset; + amd::Memory* devMemory = getMemoryObject(device_ptr, devOffset); + amd::Memory* dstMemory = getMemoryObject(dst_, dOffset); + + if (devMemory == nullptr) { + return hipErrorInvalidValue; + } + + if (dstMemory != nullptr) { + status = ihipMemcpyCommand(command, dstMemory, devMemory, count_, kind_, *stream, dOffset, devOffset); + } else { + status = ihipMemcpyCommand(command, dst_, devMemory, count_, kind_, *stream, devOffset); + } + if (status != hipSuccess) { return status; } @@ -2147,7 +2197,21 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D { if (status != hipSuccess) { return status; } - status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *stream); + + size_t devOffset, sOffset; + amd::Memory* devMemory = getMemoryObject(device_ptr, devOffset); + amd::Memory* srcMemory = getMemoryObject(src_, sOffset); + + if (devMemory == nullptr) { + return hipErrorInvalidValue; + } + + if (srcMemory != nullptr) { + status = ihipMemcpyCommand(command, devMemory, srcMemory, count_, kind_, *stream, devOffset, sOffset); + } else { + status = ihipMemcpyCommand(command, devMemory, src_, count_, kind_, *stream, devOffset); + } + if (status != hipSuccess) { return status; } diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index f802c4b68c..241067288a 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -174,8 +174,7 @@ hipError_t hipExternalMemoryGetMappedBuffer(void** devPtr, hipExternalMemory_t e auto buf = reinterpret_cast(extMem); // Validate bounds - if (bufferDesc->size > buf->getSize() || - bufferDesc->offset > buf->getSize() - bufferDesc->size) { + if (bufferDesc->size > buf->getSize() || bufferDesc->offset > buf->getSize() - bufferDesc->size) { HIP_RETURN(hipErrorInvalidValue); } @@ -461,188 +460,194 @@ bool IsHtoHMemcpyValid(void* dst, const void* src, hipMemcpyKind kind) { } // ================================================================================================ -hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { - if (dst == nullptr || src == nullptr) { - return hipErrorInvalidValue; - } - if (static_cast(kind) > hipMemcpyDefault && kind != hipMemcpyDeviceToDeviceNoCU) { - return hipErrorInvalidMemcpyDirection; - } - size_t sOffset = 0; - amd::Memory* srcMemory = getMemoryObject(src, sOffset); - size_t dOffset = 0; - amd::Memory* dstMemory = getMemoryObject(dst, dOffset); - - if (srcMemory != nullptr) { - // Validate Mem Access in case of VMM Memory - if (!srcMemory->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], false)) { - return hipErrorUnknown; - } - - // If the mem object is a VMM sub buffer (subbuffer has parent set), - // then use parent's size for validation. - if (srcMemory->parent() && (srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - srcMemory = srcMemory->parent(); - } - - // Size validation - if (sizeBytes > (srcMemory->getSize() - sOffset)) { - return hipErrorInvalidValue; - } +hipError_t ihipMemcpy_validate_memory(amd::Memory* memObj, size_t sizeBytes, size_t offset, + bool read_write) { + // Validate Mem Access in case of VMM Memory + if (!memObj->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], read_write)) { + return hipErrorUnknown; } - if (dstMemory != nullptr) { - // Validate Mem Access in case of VMM Memory - if (!dstMemory->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], true)) { - return hipErrorUnknown; - } - - // If the mem object is a VMM sub buffer (subbuffer has parent set), - // then use parent's size for validation. - if (dstMemory->parent() && (dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - dstMemory = dstMemory->parent(); - } - - // Size validation - if (sizeBytes > (dstMemory->getSize() - dOffset)) { - return hipErrorInvalidValue; - } + // If the mem object is a VMM sub buffer (subbuffer has parent set), + // then use parent's size for validation. + if (memObj->parent() && (memObj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + memObj = memObj->parent(); } - // If src and dst ptr are null then kind must be either h2h or def. - if (!IsHtoHMemcpyValid(dst, src, kind)) { + // Size validation + if (sizeBytes > (memObj->getSize() - offset)) { return hipErrorInvalidValue; } return hipSuccess; } -hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst, hipMemcpyKind kind) { - size_t sOffset = 0; - amd::Memory* srcMemory = getMemoryObject(src, sOffset); - size_t dOffset = 0; - amd::Memory* dstMemory = getMemoryObject(dst, dOffset); - hip::MemcpyType type; - if (srcMemory == nullptr && dstMemory == nullptr) { - type = hipHostToHost; - } else if ((srcMemory == nullptr) && (dstMemory != nullptr)) { - type = hipWriteBuffer; - } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { - type = hipReadBuffer; - } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { - // Check if the queue device doesn't match the device on any memory object. - // And any of them are not host allocation. - // Hence it's a P2P transfer, because the app has requested access to another GPU - if ((srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) && - ((srcMemory->getContext().devices().size() == 1) && - (dstMemory->getContext().devices().size() == 1))) { - type = hipCopyBufferP2P; - } else if (kind == hipMemcpyDeviceToDeviceNoCU) { - type = hipCopyBufferSDMA; - } else { - type = hipCopyBuffer; - } - } - return type; -} - // ================================================================================================ -hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind, hip::Stream& stream, bool isAsync) { - amd::Command::EventWaitList waitList; - size_t sOffset = 0; - amd::Memory* srcMemory = getMemoryObject(src, sOffset); - size_t dOffset = 0; - amd::Memory* dstMemory = getMemoryObject(dst, dOffset); - amd::Device* queueDevice = &stream.device(); - amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE); - hip::MemcpyType type = ihipGetMemcpyType(src, dst, kind); - hip::Stream* pStream = &stream; - switch (type) { - case hipWriteBuffer: - if (queueDevice != dstMemory->GetDeviceById() && - !(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); - amd::Command* cmd = stream.getLastQueuedCommand(true); - if (cmd != nullptr) { - waitList.push_back(cmd); - } - } - command = new amd::WriteMemoryCommand(*pStream, CL_COMMAND_WRITE_BUFFER, waitList, - *dstMemory->asBuffer(), dOffset, sizeBytes, src, 0, 0, - copyMetadata); - break; - case hipReadBuffer: - if (queueDevice != srcMemory->GetDeviceById() && - !(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); - amd::Command* cmd = stream.getLastQueuedCommand(true); - if (cmd != nullptr) { - waitList.push_back(cmd); - } - } - command = new amd::ReadMemoryCommand(*pStream, CL_COMMAND_READ_BUFFER, waitList, - *srcMemory->asBuffer(), sOffset, sizeBytes, dst, 0, 0, - copyMetadata); - break; - case hipCopyBufferP2P: - command = new amd::CopyMemoryP2PCommand(stream, CL_COMMAND_COPY_BUFFER, waitList, - *srcMemory->asBuffer(), *dstMemory->asBuffer(), - sOffset, dOffset, sizeBytes); - if (command == nullptr) { - return hipErrorOutOfMemory; - } - // Make sure runtime has valid memory for the command execution. P2P access - // requires page table mapping on the current device to another GPU memory - if (!static_cast(command)->validateMemory()) { - delete command; - return hipErrorInvalidValue; - } - break; - case hipCopyBufferSDMA: - copyMetadata.copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::SDMA; - case hipCopyBuffer: - if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) && - queueDevice != srcMemory->GetDeviceById()) { - pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); - amd::Command* cmd = stream.getLastQueuedCommand(true); - if (cmd != nullptr) { - waitList.push_back(cmd); - } - } else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) { - // Scenarios such as DtoH where dst is pinned memory - if ((queueDevice != srcMemory->GetDeviceById()) && - (dstMemory->getContext().devices().size() != 1) && - !(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); - amd::Command* cmd = stream.getLastQueuedCommand(true); - if (cmd != nullptr) { - waitList.push_back(cmd); - } - // Scenarios such as HtoD where src is pinned memory - } else if ((queueDevice != dstMemory->GetDeviceById()) && - (srcMemory->getContext().devices().size() != 1) && - !(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { - pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); - amd::Command* cmd = stream.getLastQueuedCommand(true); - if (cmd != nullptr) { - waitList.push_back(cmd); - } - } - } - command = new amd::CopyMemoryCommand(*pStream, CL_COMMAND_COPY_BUFFER, waitList, - *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, - dOffset, sizeBytes, copyMetadata); - break; - } - if (command == nullptr) { - return hipErrorOutOfMemory; - } - if (waitList.size() > 0) { - waitList[0]->release(); - } +hipError_t ihipMemcpy_validate(amd::Memory* dstMemory, amd::Memory* srcMemory, size_t sizeBytes, + size_t dstOffset, size_t srcOffset) { + hipError_t status; + + status = ihipMemcpy_validate_memory(srcMemory, sizeBytes, srcOffset, /*read_write*/ false); + if (status != hipSuccess) return status; + status = ihipMemcpy_validate_memory(dstMemory, sizeBytes, dstOffset, /*read_write*/ true); + if (status != hipSuccess) return status; + return hipSuccess; } +// ================================================================================================ +hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, amd::Memory* dst, hipMemcpyKind kind) { + if ((src->GetDeviceById() != dst->GetDeviceById()) && + ((src->getContext().devices().size() == 1) && (dst->getContext().devices().size() == 1))) { + return hipCopyBufferP2P; + } else if (kind == hipMemcpyDeviceToDeviceNoCU) { + return hipCopyBufferSDMA; + } + return hipCopyBuffer; +} + +// ================================================================================================ +// Helper class to manage common memcpy command state and cleanup +class MemcpyCommandHelper { + public: + MemcpyCommandHelper(hip::Stream& stream, bool isAsync) + : waitList_(), + copyMetadata_(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE), + pStream_(&stream), + queueDevice_(&stream.device()) {} + + ~MemcpyCommandHelper() { + // Cleanup: release waitList command if present + if (waitList_.size() > 0) { + waitList_[0]->release(); + } + } + + // Non-copyable, non-movable + MemcpyCommandHelper(const MemcpyCommandHelper&) = delete; + MemcpyCommandHelper& operator=(const MemcpyCommandHelper&) = delete; + + amd::Command::EventWaitList& waitList() { return waitList_; } + amd::CopyMetadata& copyMetadata() { return copyMetadata_; } + hip::Stream*& pStream() { return pStream_; } + amd::Device* queueDevice() const { return queueDevice_; } + + // Helper to add wait command from stream + void addWaitCommand(hip::Stream& stream) { + amd::Command* cmd = stream.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList_.push_back(cmd); + } + } + + // Helper to switch stream and add wait command + void switchStreamAndWait(hip::Stream& originalStream, amd::Context& context) { + pStream_ = hip::getNullStream(context); + addWaitCommand(originalStream); + } + + // Common error handling for command creation + static hipError_t checkCommand(amd::Command* command) { + if (command == nullptr) { + return hipErrorOutOfMemory; + } + return hipSuccess; + } + + private: + amd::Command::EventWaitList waitList_; + amd::CopyMetadata copyMetadata_; + hip::Stream* pStream_; + amd::Device* queueDevice_; +}; + +// ================================================================================================ +hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, const void* srcMemory, + size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, + size_t dstOffset, bool isAsync) { + MemcpyCommandHelper helper(stream, isAsync); + + if (&stream.device() != dstMemory->GetDeviceById() && + !(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + helper.switchStreamAndWait(stream, dstMemory->GetDeviceById()->context()); + } + command = new amd::WriteMemoryCommand( + *helper.pStream(), CL_COMMAND_WRITE_BUFFER, helper.waitList(), *dstMemory->asBuffer(), + dstOffset, sizeBytes, srcMemory, 0, 0, helper.copyMetadata()); + return MemcpyCommandHelper::checkCommand(command); +} + +// ================================================================================================ +hipError_t ihipMemcpyCommand(amd::Command*& command, void* dstMemory, amd::Memory* srcMemory, + size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, + size_t srcOffset, bool isAsync) { + MemcpyCommandHelper helper(stream, isAsync); + + if (helper.queueDevice() != srcMemory->GetDeviceById() && + !(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context()); + } + command = new amd::ReadMemoryCommand(*helper.pStream(), CL_COMMAND_READ_BUFFER, helper.waitList(), + *srcMemory->asBuffer(), srcOffset, sizeBytes, + dstMemory, 0, 0, helper.copyMetadata()); + return MemcpyCommandHelper::checkCommand(command); +} + +// ================================================================================================ +hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, amd::Memory* srcMemory, + size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream, + size_t dstOffset, size_t srcOffset, bool isAsync) { + MemcpyCommandHelper helper(stream, isAsync); + + hip::MemcpyType type = ihipGetMemcpyType(srcMemory, dstMemory, kind); + switch (type) { + case hipCopyBufferP2P: + command = new amd::CopyMemoryP2PCommand( + stream, CL_COMMAND_COPY_BUFFER, helper.waitList(), *srcMemory->asBuffer(), + *dstMemory->asBuffer(), srcOffset, dstOffset, sizeBytes); + { + hipError_t status = MemcpyCommandHelper::checkCommand(command); + if (status != hipSuccess) { + return status; + } + // Make sure runtime has valid memory for the command execution. P2P access + // requires page table mapping on the current device to another GPU memory + if (!static_cast(command)->validateMemory()) { + delete command; + return hipErrorInvalidValue; + } + } + break; + case hipCopyBufferSDMA: + helper.copyMetadata().copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::SDMA; + case hipCopyBuffer: + if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) && + helper.queueDevice() != srcMemory->GetDeviceById()) { + helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context()); + } else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) { + // Scenarios such as DtoH where dst is pinned memory + if ((helper.queueDevice() != srcMemory->GetDeviceById()) && + (dstMemory->getContext().devices().size() != 1) && + !(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context()); + // Scenarios such as HtoD where src is pinned memory + } else if ((helper.queueDevice() != dstMemory->GetDeviceById()) && + (srcMemory->getContext().devices().size() != 1) && + !(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + helper.switchStreamAndWait(stream, dstMemory->GetDeviceById()->context()); + } + } + command = new amd::CopyMemoryCommand( + *helper.pStream(), CL_COMMAND_COPY_BUFFER, helper.waitList(), *srcMemory->asBuffer(), + *dstMemory->asBuffer(), srcOffset, dstOffset, sizeBytes, + helper.copyMetadata()); + break; + case hipHostToHost: + assert(false && "Unreachable case"); + break; + } + return MemcpyCommandHelper::checkCommand(command); +} + +// ================================================================================================ bool IsHtoHMemcpy(void* dst, const void* src) { size_t sOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, sOffset); @@ -668,43 +673,79 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin // Skip if nothing needs writing. return hipSuccess; } - status = ihipMemcpy_validate(dst, src, sizeBytes, kind); - if (status != hipSuccess) { - return status; + if (dst == nullptr || src == nullptr) { + return hipErrorInvalidValue; + } + if (static_cast(kind) > hipMemcpyDefault && kind != hipMemcpyDeviceToDeviceNoCU) { + return hipErrorInvalidMemcpyDirection; } if (src == dst && kind == hipMemcpyDefault) { return hipSuccess; } + size_t sOffset = 0; - amd::Memory* srcMemory = getMemoryObject(src, sOffset); + amd::Memory* srcDeviceMemory = getMemoryObject(src, sOffset); size_t dOffset = 0; - amd::Memory* dstMemory = getMemoryObject(dst, dOffset); - - hipMemoryType srcMemoryType = getMemoryType(srcMemory); - hipMemoryType dstMemoryType = getMemoryType(dstMemory); - - if (srcMemory == nullptr && dstMemory == nullptr) { - ihipHtoHMemcpy(dst, src, sizeBytes, stream); - return hipSuccess; - } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) || - ((srcMemory != nullptr) && (dstMemory == nullptr))) { - // Unpinned copy wait behavior is enforced in the lower copy layers so skip - // wait at top level except for MT path - isHostAsync &= AMD_DIRECT_DISPATCH ? true : false; - } else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) { - // Device to Device copies do not need to host side synchronization. - if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) && - (!srcMemory->getUserData().sync_mem_ops_ || !dstMemory->getUserData().sync_mem_ops_)) { - isHostAsync = true; - } - // Any Host to any Host need host side synchronization. - if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) { - isHostAsync = false; - } + amd::Memory* dstDeviceMemory = getMemoryObject(dst, dOffset); + + // Handle kind vs memobject miss matches + if (kind == hipMemcpyDeviceToHost && srcDeviceMemory == nullptr) { + return hipErrorInvalidValue; + } + if (kind == hipMemcpyHostToDevice && dstDeviceMemory == nullptr) { + return hipErrorInvalidValue; } amd::Command* command = nullptr; - status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, stream, isHostAsync); + if (srcDeviceMemory == nullptr && dstDeviceMemory == nullptr) { + ihipHtoHMemcpy(dst, src, sizeBytes, stream); + return hipSuccess; + } else if (dstDeviceMemory == nullptr || srcDeviceMemory == nullptr) { + // Unpinned copy wait behavior is enforced in the lower copy layers so skip + // wait at top level except for MT path + isHostAsync &= AMD_DIRECT_DISPATCH ? true : false; + if (dstDeviceMemory != nullptr) { + status = ihipMemcpy_validate_memory(dstDeviceMemory, sizeBytes, dOffset, /*read_write*/ true); + if (status != hipSuccess) { + return status; + } + status = + ihipMemcpyCommand(command, dstDeviceMemory, src, sizeBytes, kind, stream, dOffset, isHostAsync); + } else { + status = + ihipMemcpy_validate_memory(srcDeviceMemory, sizeBytes, sOffset, /*read_write*/ false); + if (status != hipSuccess) { + return status; + } + status = + ihipMemcpyCommand(command, dst, srcDeviceMemory, sizeBytes, kind, stream, sOffset, isHostAsync); + } + } else { + // Both are AMD memory + hipMemoryType srcMemoryType = getMemoryType(srcDeviceMemory); + hipMemoryType dstMemoryType = getMemoryType(dstDeviceMemory); + + status = ihipMemcpy_validate(dstDeviceMemory, srcDeviceMemory, sizeBytes, dOffset, sOffset); + if (status != hipSuccess) { + return status; + } + + if (srcDeviceMemory->GetDeviceById() == dstDeviceMemory->GetDeviceById()) { + // Device to Device copies do not need to host side synchronization. + if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) && + (!srcDeviceMemory->getUserData().sync_mem_ops_ || + !dstDeviceMemory->getUserData().sync_mem_ops_)) { + isHostAsync = true; + } + // Any Host to any Host need host side synchronization. + if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) { + isHostAsync = false; + } + } + status = ihipMemcpyCommand(command, dstDeviceMemory, srcDeviceMemory, sizeBytes, kind, stream, + dOffset, sOffset, isHostAsync); + } + if (status != hipSuccess) { return status; } @@ -712,7 +753,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin if (!isHostAsync) { command->queue()->finishCommand(command); } else if (!isGPUAsync) { - hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); + hip::Stream* pStream = hip::getNullStream(dstDeviceMemory->GetDeviceById()->context()); amd::Command::EventWaitList waitList; waitList.push_back(command); amd::Command* depdentMarker = new amd::Marker(*pStream, false, waitList); @@ -3684,7 +3725,7 @@ hipError_t ihipPointerGetAttributes(void* data, hipPointer_attribute attribute, case HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE: { if (memObj) { if (getMemoryType(memObj) == hipMemoryTypeHost) { - // host pointer, pinned or registered memory + // host pointer, pinned or registered memory *reinterpret_cast(data) = 0; } else if ((memObj->getMemFlags() & kManagedAlloc) == kManagedAlloc) { // managed allocation