SWDEV-566854 - Improve memory object handling (#1939)

* Improve memory object handling for memcpy

* update

* Pass offsets and make hip_graph changes

* Update projects/clr/hipamd/src/hip_memory.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Remove unnecessary command overload

* Update based on feedback

* Fix failing hipGraphTests

* Fix graph bugs

* Fix failing memcpy tests

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
AidanBeltonS
2026-01-05 18:05:56 +00:00
committad av GitHub
förälder 11d9472e5f
incheckning 39d8432893
4 ändrade filer med 354 tillägg och 213 borttagningar
+27 -3
Visa fil
@@ -34,8 +34,23 @@ hipError_t hipMemcpy2DValidateBuffer(const void* buf, size_t pitch, size_t width
hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes,
hipMemcpyKind kind, hip::Stream& stream, bool isAsync = true);
hipError_t ihipMemcpy_validate_memory(amd::Memory* memObj, size_t sizeBytes, size_t offset,
bool read_write);
hipError_t ihipMemcpy_validate(amd::Memory* dstMemory, amd::Memory* srcMemory, size_t sizeBytes,
size_t dstOffset, size_t srcOffset);
hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, const void* srcMemory,
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
size_t dstOffset, bool isAsync = true);
hipError_t ihipMemcpyCommand(amd::Command*& command, void* dstMemory, amd::Memory* srcMemory,
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
size_t srcOffset, bool isAsync = true);
hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, amd::Memory* srcMemory,
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
size_t dstOffset, size_t srcOffset, bool isAsync = true);
void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& stream);
@@ -123,5 +138,14 @@ hipError_t ihipMemcpyAtoHValidate(hipArray_t srcArray, void* dstHost, amd::Coord
hipError_t ihipGraphMemsetParams_validate(const hipMemsetParams* pNodeParams);
hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst, hipMemcpyKind kind);
constexpr hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst) {
return hipHostToHost;
}
constexpr hip::MemcpyType ihipGetMemcpyType(const void* src, amd::Memory* dst) {
return hipWriteBuffer;
}
constexpr hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, void* dst) {
return hipReadBuffer;
}
hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, amd::Memory* dst, hipMemcpyKind kind);
} // namespace hip
+15 -3
Visa fil
@@ -77,9 +77,8 @@ amd::Monitor GraphNode::WorkerThreadLock_{};
hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t count,
hipMemcpyKind kind) {
hipError_t status = ihipMemcpy_validate(dst, src, count, kind);
if (status != hipSuccess) {
return status;
if (dst == nullptr || src == nullptr) {
return hipErrorInvalidValue;
}
size_t sOffset = 0;
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
@@ -96,6 +95,19 @@ hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t
}
}
if (srcMemory != nullptr) {
hipError_t status = ihipMemcpy_validate_memory(srcMemory, count, sOffset, /*read_write*/ false);
if (status != hipSuccess) {
return status;
}
}
if (dstMemory != nullptr) {
hipError_t status = ihipMemcpy_validate_memory(dstMemory, count, dOffset, /*read_write*/ true);
if (status != hipSuccess) {
return status;
}
}
return hipSuccess;
}
+70 -6
Visa fil
@@ -1797,7 +1797,16 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
amd::Memory* srcMemory = getMemoryObject(src_, sOffset);
size_t dOffset = 0;
amd::Memory* dstMemory = getMemoryObject(dst_, dOffset);
hip::MemcpyType memType = ihipGetMemcpyType(src_, dst_, kind_);
hip::MemcpyType memType = hipHostToHost;
if (srcMemory != nullptr && dstMemory == nullptr) {
memType = ihipGetMemcpyType(srcMemory, dst_);
} else if (srcMemory == nullptr && dstMemory != nullptr) {
memType = ihipGetMemcpyType(src_, dstMemory);
} else if (srcMemory != nullptr && dstMemory != nullptr) {
memType = ihipGetMemcpyType(srcMemory, dstMemory, kind_);
}
switch (memType) {
case hipCopyBuffer:
// D2H/H2D source/dst is pinned memory
@@ -1855,8 +1864,24 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
if (!AMD_DIRECT_DISPATCH) {
WorkerThreadLock_.lock();
}
status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *stream);
hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_);
hip::MemcpyType type;
size_t dOffset, sOffset;
amd::Memory* dstMemory = getMemoryObject(dst_, dOffset);
amd::Memory* srcMemory = getMemoryObject(src_, sOffset);
if (dstMemory != nullptr && srcMemory != nullptr) {
status = ihipMemcpyCommand(command, dstMemory, srcMemory, count_, kind_, *stream, dOffset,
sOffset);
type = ihipGetMemcpyType(srcMemory, dstMemory, kind_);
} else if (dstMemory == nullptr && srcMemory != nullptr) {
status = ihipMemcpyCommand(command, dst_, srcMemory, count_, kind_, *stream, sOffset);
type = ihipGetMemcpyType(srcMemory, dst_);
} else if (dstMemory != nullptr && srcMemory == nullptr) {
status = ihipMemcpyCommand(command, dstMemory, src_, count_, kind_, *stream, dOffset);
type = ihipGetMemcpyType(src_, dstMemory);
}
if (type == hipCopyBuffer) {
amd::CopyMemoryCommand* cpycmd = reinterpret_cast<amd::CopyMemoryCommand*>(command);
amd::CopyMetadata copyMetadata = cpycmd->copyMetadata();
@@ -2005,7 +2030,18 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
}
virtual bool GraphCaptureEnabled() override {
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_);
hip::MemcpyType type;
size_t dOffset, sOffset;
amd::Memory* dstMemory = getMemoryObject(dst_, dOffset);
amd::Memory* srcMemory = getMemoryObject(src_, sOffset);
// The case below is only interested in hipCopyBuffer,
// which is only valid for device to device copies.
if (dstMemory != nullptr && srcMemory != nullptr) {
type = ihipGetMemcpyType(srcMemory, dstMemory, kind_);
}
switch (type) {
case hipCopyBuffer:
return true;
@@ -2052,7 +2088,21 @@ class GraphMemcpyNodeFromSymbol : public GraphMemcpyNode1D {
if (status != hipSuccess) {
return status;
}
status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *stream);
size_t devOffset, dOffset;
amd::Memory* devMemory = getMemoryObject(device_ptr, devOffset);
amd::Memory* dstMemory = getMemoryObject(dst_, dOffset);
if (devMemory == nullptr) {
return hipErrorInvalidValue;
}
if (dstMemory != nullptr) {
status = ihipMemcpyCommand(command, dstMemory, devMemory, count_, kind_, *stream, dOffset, devOffset);
} else {
status = ihipMemcpyCommand(command, dst_, devMemory, count_, kind_, *stream, devOffset);
}
if (status != hipSuccess) {
return status;
}
@@ -2147,7 +2197,21 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D {
if (status != hipSuccess) {
return status;
}
status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *stream);
size_t devOffset, sOffset;
amd::Memory* devMemory = getMemoryObject(device_ptr, devOffset);
amd::Memory* srcMemory = getMemoryObject(src_, sOffset);
if (devMemory == nullptr) {
return hipErrorInvalidValue;
}
if (srcMemory != nullptr) {
status = ihipMemcpyCommand(command, devMemory, srcMemory, count_, kind_, *stream, devOffset, sOffset);
} else {
status = ihipMemcpyCommand(command, devMemory, src_, count_, kind_, *stream, devOffset);
}
if (status != hipSuccess) {
return status;
}
+242 -201
Visa fil
@@ -174,8 +174,7 @@ hipError_t hipExternalMemoryGetMappedBuffer(void** devPtr, hipExternalMemory_t e
auto buf = reinterpret_cast<amd::ExternalBuffer*>(extMem);
// Validate bounds
if (bufferDesc->size > buf->getSize() ||
bufferDesc->offset > buf->getSize() - bufferDesc->size) {
if (bufferDesc->size > buf->getSize() || bufferDesc->offset > buf->getSize() - bufferDesc->size) {
HIP_RETURN(hipErrorInvalidValue);
}
@@ -461,188 +460,194 @@ bool IsHtoHMemcpyValid(void* dst, const void* src, hipMemcpyKind kind) {
}
// ================================================================================================
hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) {
if (dst == nullptr || src == nullptr) {
return hipErrorInvalidValue;
}
if (static_cast<uint32_t>(kind) > hipMemcpyDefault && kind != hipMemcpyDeviceToDeviceNoCU) {
return hipErrorInvalidMemcpyDirection;
}
size_t sOffset = 0;
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
size_t dOffset = 0;
amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
if (srcMemory != nullptr) {
// Validate Mem Access in case of VMM Memory
if (!srcMemory->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], false)) {
return hipErrorUnknown;
}
// If the mem object is a VMM sub buffer (subbuffer has parent set),
// then use parent's size for validation.
if (srcMemory->parent() && (srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
srcMemory = srcMemory->parent();
}
// Size validation
if (sizeBytes > (srcMemory->getSize() - sOffset)) {
return hipErrorInvalidValue;
}
hipError_t ihipMemcpy_validate_memory(amd::Memory* memObj, size_t sizeBytes, size_t offset,
bool read_write) {
// Validate Mem Access in case of VMM Memory
if (!memObj->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], read_write)) {
return hipErrorUnknown;
}
if (dstMemory != nullptr) {
// Validate Mem Access in case of VMM Memory
if (!dstMemory->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], true)) {
return hipErrorUnknown;
}
// If the mem object is a VMM sub buffer (subbuffer has parent set),
// then use parent's size for validation.
if (dstMemory->parent() && (dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
dstMemory = dstMemory->parent();
}
// Size validation
if (sizeBytes > (dstMemory->getSize() - dOffset)) {
return hipErrorInvalidValue;
}
// If the mem object is a VMM sub buffer (subbuffer has parent set),
// then use parent's size for validation.
if (memObj->parent() && (memObj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
memObj = memObj->parent();
}
// If src and dst ptr are null then kind must be either h2h or def.
if (!IsHtoHMemcpyValid(dst, src, kind)) {
// Size validation
if (sizeBytes > (memObj->getSize() - offset)) {
return hipErrorInvalidValue;
}
return hipSuccess;
}
hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst, hipMemcpyKind kind) {
size_t sOffset = 0;
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
size_t dOffset = 0;
amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
hip::MemcpyType type;
if (srcMemory == nullptr && dstMemory == nullptr) {
type = hipHostToHost;
} else if ((srcMemory == nullptr) && (dstMemory != nullptr)) {
type = hipWriteBuffer;
} else if ((srcMemory != nullptr) && (dstMemory == nullptr)) {
type = hipReadBuffer;
} else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
// Check if the queue device doesn't match the device on any memory object.
// And any of them are not host allocation.
// Hence it's a P2P transfer, because the app has requested access to another GPU
if ((srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) &&
((srcMemory->getContext().devices().size() == 1) &&
(dstMemory->getContext().devices().size() == 1))) {
type = hipCopyBufferP2P;
} else if (kind == hipMemcpyDeviceToDeviceNoCU) {
type = hipCopyBufferSDMA;
} else {
type = hipCopyBuffer;
}
}
return type;
}
// ================================================================================================
hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes,
hipMemcpyKind kind, hip::Stream& stream, bool isAsync) {
amd::Command::EventWaitList waitList;
size_t sOffset = 0;
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
size_t dOffset = 0;
amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
amd::Device* queueDevice = &stream.device();
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
hip::MemcpyType type = ihipGetMemcpyType(src, dst, kind);
hip::Stream* pStream = &stream;
switch (type) {
case hipWriteBuffer:
if (queueDevice != dstMemory->GetDeviceById() &&
!(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
amd::Command* cmd = stream.getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
}
command = new amd::WriteMemoryCommand(*pStream, CL_COMMAND_WRITE_BUFFER, waitList,
*dstMemory->asBuffer(), dOffset, sizeBytes, src, 0, 0,
copyMetadata);
break;
case hipReadBuffer:
if (queueDevice != srcMemory->GetDeviceById() &&
!(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
amd::Command* cmd = stream.getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
}
command = new amd::ReadMemoryCommand(*pStream, CL_COMMAND_READ_BUFFER, waitList,
*srcMemory->asBuffer(), sOffset, sizeBytes, dst, 0, 0,
copyMetadata);
break;
case hipCopyBufferP2P:
command = new amd::CopyMemoryP2PCommand(stream, CL_COMMAND_COPY_BUFFER, waitList,
*srcMemory->asBuffer(), *dstMemory->asBuffer(),
sOffset, dOffset, sizeBytes);
if (command == nullptr) {
return hipErrorOutOfMemory;
}
// Make sure runtime has valid memory for the command execution. P2P access
// requires page table mapping on the current device to another GPU memory
if (!static_cast<amd::CopyMemoryP2PCommand*>(command)->validateMemory()) {
delete command;
return hipErrorInvalidValue;
}
break;
case hipCopyBufferSDMA:
copyMetadata.copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::SDMA;
case hipCopyBuffer:
if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) &&
queueDevice != srcMemory->GetDeviceById()) {
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
amd::Command* cmd = stream.getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
} else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) {
// Scenarios such as DtoH where dst is pinned memory
if ((queueDevice != srcMemory->GetDeviceById()) &&
(dstMemory->getContext().devices().size() != 1) &&
!(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
amd::Command* cmd = stream.getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
// Scenarios such as HtoD where src is pinned memory
} else if ((queueDevice != dstMemory->GetDeviceById()) &&
(srcMemory->getContext().devices().size() != 1) &&
!(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
amd::Command* cmd = stream.getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
}
}
command = new amd::CopyMemoryCommand(*pStream, CL_COMMAND_COPY_BUFFER, waitList,
*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset,
dOffset, sizeBytes, copyMetadata);
break;
}
if (command == nullptr) {
return hipErrorOutOfMemory;
}
if (waitList.size() > 0) {
waitList[0]->release();
}
hipError_t ihipMemcpy_validate(amd::Memory* dstMemory, amd::Memory* srcMemory, size_t sizeBytes,
size_t dstOffset, size_t srcOffset) {
hipError_t status;
status = ihipMemcpy_validate_memory(srcMemory, sizeBytes, srcOffset, /*read_write*/ false);
if (status != hipSuccess) return status;
status = ihipMemcpy_validate_memory(dstMemory, sizeBytes, dstOffset, /*read_write*/ true);
if (status != hipSuccess) return status;
return hipSuccess;
}
// ================================================================================================
hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, amd::Memory* dst, hipMemcpyKind kind) {
if ((src->GetDeviceById() != dst->GetDeviceById()) &&
((src->getContext().devices().size() == 1) && (dst->getContext().devices().size() == 1))) {
return hipCopyBufferP2P;
} else if (kind == hipMemcpyDeviceToDeviceNoCU) {
return hipCopyBufferSDMA;
}
return hipCopyBuffer;
}
// ================================================================================================
// Helper class to manage common memcpy command state and cleanup
class MemcpyCommandHelper {
public:
MemcpyCommandHelper(hip::Stream& stream, bool isAsync)
: waitList_(),
copyMetadata_(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE),
pStream_(&stream),
queueDevice_(&stream.device()) {}
~MemcpyCommandHelper() {
// Cleanup: release waitList command if present
if (waitList_.size() > 0) {
waitList_[0]->release();
}
}
// Non-copyable, non-movable
MemcpyCommandHelper(const MemcpyCommandHelper&) = delete;
MemcpyCommandHelper& operator=(const MemcpyCommandHelper&) = delete;
amd::Command::EventWaitList& waitList() { return waitList_; }
amd::CopyMetadata& copyMetadata() { return copyMetadata_; }
hip::Stream*& pStream() { return pStream_; }
amd::Device* queueDevice() const { return queueDevice_; }
// Helper to add wait command from stream
void addWaitCommand(hip::Stream& stream) {
amd::Command* cmd = stream.getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList_.push_back(cmd);
}
}
// Helper to switch stream and add wait command
void switchStreamAndWait(hip::Stream& originalStream, amd::Context& context) {
pStream_ = hip::getNullStream(context);
addWaitCommand(originalStream);
}
// Common error handling for command creation
static hipError_t checkCommand(amd::Command* command) {
if (command == nullptr) {
return hipErrorOutOfMemory;
}
return hipSuccess;
}
private:
amd::Command::EventWaitList waitList_;
amd::CopyMetadata copyMetadata_;
hip::Stream* pStream_;
amd::Device* queueDevice_;
};
// ================================================================================================
hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, const void* srcMemory,
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
size_t dstOffset, bool isAsync) {
MemcpyCommandHelper helper(stream, isAsync);
if (&stream.device() != dstMemory->GetDeviceById() &&
!(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
helper.switchStreamAndWait(stream, dstMemory->GetDeviceById()->context());
}
command = new amd::WriteMemoryCommand(
*helper.pStream(), CL_COMMAND_WRITE_BUFFER, helper.waitList(), *dstMemory->asBuffer(),
dstOffset, sizeBytes, srcMemory, 0, 0, helper.copyMetadata());
return MemcpyCommandHelper::checkCommand(command);
}
// ================================================================================================
hipError_t ihipMemcpyCommand(amd::Command*& command, void* dstMemory, amd::Memory* srcMemory,
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
size_t srcOffset, bool isAsync) {
MemcpyCommandHelper helper(stream, isAsync);
if (helper.queueDevice() != srcMemory->GetDeviceById() &&
!(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context());
}
command = new amd::ReadMemoryCommand(*helper.pStream(), CL_COMMAND_READ_BUFFER, helper.waitList(),
*srcMemory->asBuffer(), srcOffset, sizeBytes,
dstMemory, 0, 0, helper.copyMetadata());
return MemcpyCommandHelper::checkCommand(command);
}
// ================================================================================================
hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, amd::Memory* srcMemory,
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
size_t dstOffset, size_t srcOffset, bool isAsync) {
MemcpyCommandHelper helper(stream, isAsync);
hip::MemcpyType type = ihipGetMemcpyType(srcMemory, dstMemory, kind);
switch (type) {
case hipCopyBufferP2P:
command = new amd::CopyMemoryP2PCommand(
stream, CL_COMMAND_COPY_BUFFER, helper.waitList(), *srcMemory->asBuffer(),
*dstMemory->asBuffer(), srcOffset, dstOffset, sizeBytes);
{
hipError_t status = MemcpyCommandHelper::checkCommand(command);
if (status != hipSuccess) {
return status;
}
// Make sure runtime has valid memory for the command execution. P2P access
// requires page table mapping on the current device to another GPU memory
if (!static_cast<amd::CopyMemoryP2PCommand*>(command)->validateMemory()) {
delete command;
return hipErrorInvalidValue;
}
}
break;
case hipCopyBufferSDMA:
helper.copyMetadata().copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::SDMA;
case hipCopyBuffer:
if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) &&
helper.queueDevice() != srcMemory->GetDeviceById()) {
helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context());
} else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) {
// Scenarios such as DtoH where dst is pinned memory
if ((helper.queueDevice() != srcMemory->GetDeviceById()) &&
(dstMemory->getContext().devices().size() != 1) &&
!(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context());
// Scenarios such as HtoD where src is pinned memory
} else if ((helper.queueDevice() != dstMemory->GetDeviceById()) &&
(srcMemory->getContext().devices().size() != 1) &&
!(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
helper.switchStreamAndWait(stream, dstMemory->GetDeviceById()->context());
}
}
command = new amd::CopyMemoryCommand(
*helper.pStream(), CL_COMMAND_COPY_BUFFER, helper.waitList(), *srcMemory->asBuffer(),
*dstMemory->asBuffer(), srcOffset, dstOffset, sizeBytes,
helper.copyMetadata());
break;
case hipHostToHost:
assert(false && "Unreachable case");
break;
}
return MemcpyCommandHelper::checkCommand(command);
}
// ================================================================================================
bool IsHtoHMemcpy(void* dst, const void* src) {
size_t sOffset = 0;
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
@@ -668,43 +673,79 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
// Skip if nothing needs writing.
return hipSuccess;
}
status = ihipMemcpy_validate(dst, src, sizeBytes, kind);
if (status != hipSuccess) {
return status;
if (dst == nullptr || src == nullptr) {
return hipErrorInvalidValue;
}
if (static_cast<uint32_t>(kind) > hipMemcpyDefault && kind != hipMemcpyDeviceToDeviceNoCU) {
return hipErrorInvalidMemcpyDirection;
}
if (src == dst && kind == hipMemcpyDefault) {
return hipSuccess;
}
size_t sOffset = 0;
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
amd::Memory* srcDeviceMemory = getMemoryObject(src, sOffset);
size_t dOffset = 0;
amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
hipMemoryType srcMemoryType = getMemoryType(srcMemory);
hipMemoryType dstMemoryType = getMemoryType(dstMemory);
if (srcMemory == nullptr && dstMemory == nullptr) {
ihipHtoHMemcpy(dst, src, sizeBytes, stream);
return hipSuccess;
} else if (((srcMemory == nullptr) && (dstMemory != nullptr)) ||
((srcMemory != nullptr) && (dstMemory == nullptr))) {
// Unpinned copy wait behavior is enforced in the lower copy layers so skip
// wait at top level except for MT path
isHostAsync &= AMD_DIRECT_DISPATCH ? true : false;
} else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) {
// Device to Device copies do not need to host side synchronization.
if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) &&
(!srcMemory->getUserData().sync_mem_ops_ || !dstMemory->getUserData().sync_mem_ops_)) {
isHostAsync = true;
}
// Any Host to any Host need host side synchronization.
if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) {
isHostAsync = false;
}
amd::Memory* dstDeviceMemory = getMemoryObject(dst, dOffset);
// Handle kind vs memobject miss matches
if (kind == hipMemcpyDeviceToHost && srcDeviceMemory == nullptr) {
return hipErrorInvalidValue;
}
if (kind == hipMemcpyHostToDevice && dstDeviceMemory == nullptr) {
return hipErrorInvalidValue;
}
amd::Command* command = nullptr;
status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, stream, isHostAsync);
if (srcDeviceMemory == nullptr && dstDeviceMemory == nullptr) {
ihipHtoHMemcpy(dst, src, sizeBytes, stream);
return hipSuccess;
} else if (dstDeviceMemory == nullptr || srcDeviceMemory == nullptr) {
// Unpinned copy wait behavior is enforced in the lower copy layers so skip
// wait at top level except for MT path
isHostAsync &= AMD_DIRECT_DISPATCH ? true : false;
if (dstDeviceMemory != nullptr) {
status = ihipMemcpy_validate_memory(dstDeviceMemory, sizeBytes, dOffset, /*read_write*/ true);
if (status != hipSuccess) {
return status;
}
status =
ihipMemcpyCommand(command, dstDeviceMemory, src, sizeBytes, kind, stream, dOffset, isHostAsync);
} else {
status =
ihipMemcpy_validate_memory(srcDeviceMemory, sizeBytes, sOffset, /*read_write*/ false);
if (status != hipSuccess) {
return status;
}
status =
ihipMemcpyCommand(command, dst, srcDeviceMemory, sizeBytes, kind, stream, sOffset, isHostAsync);
}
} else {
// Both are AMD memory
hipMemoryType srcMemoryType = getMemoryType(srcDeviceMemory);
hipMemoryType dstMemoryType = getMemoryType(dstDeviceMemory);
status = ihipMemcpy_validate(dstDeviceMemory, srcDeviceMemory, sizeBytes, dOffset, sOffset);
if (status != hipSuccess) {
return status;
}
if (srcDeviceMemory->GetDeviceById() == dstDeviceMemory->GetDeviceById()) {
// Device to Device copies do not need to host side synchronization.
if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) &&
(!srcDeviceMemory->getUserData().sync_mem_ops_ ||
!dstDeviceMemory->getUserData().sync_mem_ops_)) {
isHostAsync = true;
}
// Any Host to any Host need host side synchronization.
if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) {
isHostAsync = false;
}
}
status = ihipMemcpyCommand(command, dstDeviceMemory, srcDeviceMemory, sizeBytes, kind, stream,
dOffset, sOffset, isHostAsync);
}
if (status != hipSuccess) {
return status;
}
@@ -712,7 +753,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
if (!isHostAsync) {
command->queue()->finishCommand(command);
} else if (!isGPUAsync) {
hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
hip::Stream* pStream = hip::getNullStream(dstDeviceMemory->GetDeviceById()->context());
amd::Command::EventWaitList waitList;
waitList.push_back(command);
amd::Command* depdentMarker = new amd::Marker(*pStream, false, waitList);
@@ -3684,7 +3725,7 @@ hipError_t ihipPointerGetAttributes(void* data, hipPointer_attribute attribute,
case HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE: {
if (memObj) {
if (getMemoryType(memObj) == hipMemoryTypeHost) {
// host pointer, pinned or registered memory
// host pointer, pinned or registered memory
*reinterpret_cast<int*>(data) = 0;
} else if ((memObj->getMemFlags() & kManagedAlloc) == kManagedAlloc) {
// managed allocation