SWDEV-566854 - Improve memory object handling (#1939)
* Improve memory object handling for memcpy * update * Pass offsets and make hip_graph changes * Update projects/clr/hipamd/src/hip_memory.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Remove unnecessary command overload * Update based on feedback * Fix failing hipGraphTests * Fix graph bugs * Fix failing memcpy tests --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
committad av
GitHub
förälder
11d9472e5f
incheckning
39d8432893
@@ -34,8 +34,23 @@ hipError_t hipMemcpy2DValidateBuffer(const void* buf, size_t pitch, size_t width
|
||||
|
||||
hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
|
||||
|
||||
hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes,
|
||||
hipMemcpyKind kind, hip::Stream& stream, bool isAsync = true);
|
||||
hipError_t ihipMemcpy_validate_memory(amd::Memory* memObj, size_t sizeBytes, size_t offset,
|
||||
bool read_write);
|
||||
|
||||
hipError_t ihipMemcpy_validate(amd::Memory* dstMemory, amd::Memory* srcMemory, size_t sizeBytes,
|
||||
size_t dstOffset, size_t srcOffset);
|
||||
|
||||
hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, const void* srcMemory,
|
||||
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
|
||||
size_t dstOffset, bool isAsync = true);
|
||||
|
||||
hipError_t ihipMemcpyCommand(amd::Command*& command, void* dstMemory, amd::Memory* srcMemory,
|
||||
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
|
||||
size_t srcOffset, bool isAsync = true);
|
||||
|
||||
hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, amd::Memory* srcMemory,
|
||||
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
|
||||
size_t dstOffset, size_t srcOffset, bool isAsync = true);
|
||||
|
||||
void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& stream);
|
||||
|
||||
@@ -123,5 +138,14 @@ hipError_t ihipMemcpyAtoHValidate(hipArray_t srcArray, void* dstHost, amd::Coord
|
||||
|
||||
hipError_t ihipGraphMemsetParams_validate(const hipMemsetParams* pNodeParams);
|
||||
|
||||
hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst, hipMemcpyKind kind);
|
||||
constexpr hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst) {
|
||||
return hipHostToHost;
|
||||
}
|
||||
constexpr hip::MemcpyType ihipGetMemcpyType(const void* src, amd::Memory* dst) {
|
||||
return hipWriteBuffer;
|
||||
}
|
||||
constexpr hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, void* dst) {
|
||||
return hipReadBuffer;
|
||||
}
|
||||
hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, amd::Memory* dst, hipMemcpyKind kind);
|
||||
} // namespace hip
|
||||
|
||||
@@ -77,9 +77,8 @@ amd::Monitor GraphNode::WorkerThreadLock_{};
|
||||
|
||||
hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t count,
|
||||
hipMemcpyKind kind) {
|
||||
hipError_t status = ihipMemcpy_validate(dst, src, count, kind);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
if (dst == nullptr || src == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
size_t sOffset = 0;
|
||||
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
|
||||
@@ -96,6 +95,19 @@ hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t
|
||||
}
|
||||
}
|
||||
|
||||
if (srcMemory != nullptr) {
|
||||
hipError_t status = ihipMemcpy_validate_memory(srcMemory, count, sOffset, /*read_write*/ false);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
if (dstMemory != nullptr) {
|
||||
hipError_t status = ihipMemcpy_validate_memory(dstMemory, count, dOffset, /*read_write*/ true);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -1797,7 +1797,16 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
|
||||
amd::Memory* srcMemory = getMemoryObject(src_, sOffset);
|
||||
size_t dOffset = 0;
|
||||
amd::Memory* dstMemory = getMemoryObject(dst_, dOffset);
|
||||
hip::MemcpyType memType = ihipGetMemcpyType(src_, dst_, kind_);
|
||||
|
||||
hip::MemcpyType memType = hipHostToHost;
|
||||
if (srcMemory != nullptr && dstMemory == nullptr) {
|
||||
memType = ihipGetMemcpyType(srcMemory, dst_);
|
||||
} else if (srcMemory == nullptr && dstMemory != nullptr) {
|
||||
memType = ihipGetMemcpyType(src_, dstMemory);
|
||||
} else if (srcMemory != nullptr && dstMemory != nullptr) {
|
||||
memType = ihipGetMemcpyType(srcMemory, dstMemory, kind_);
|
||||
}
|
||||
|
||||
switch (memType) {
|
||||
case hipCopyBuffer:
|
||||
// D2H/H2D source/dst is pinned memory
|
||||
@@ -1855,8 +1864,24 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
|
||||
if (!AMD_DIRECT_DISPATCH) {
|
||||
WorkerThreadLock_.lock();
|
||||
}
|
||||
status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *stream);
|
||||
hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_);
|
||||
|
||||
hip::MemcpyType type;
|
||||
size_t dOffset, sOffset;
|
||||
amd::Memory* dstMemory = getMemoryObject(dst_, dOffset);
|
||||
amd::Memory* srcMemory = getMemoryObject(src_, sOffset);
|
||||
|
||||
if (dstMemory != nullptr && srcMemory != nullptr) {
|
||||
status = ihipMemcpyCommand(command, dstMemory, srcMemory, count_, kind_, *stream, dOffset,
|
||||
sOffset);
|
||||
type = ihipGetMemcpyType(srcMemory, dstMemory, kind_);
|
||||
} else if (dstMemory == nullptr && srcMemory != nullptr) {
|
||||
status = ihipMemcpyCommand(command, dst_, srcMemory, count_, kind_, *stream, sOffset);
|
||||
type = ihipGetMemcpyType(srcMemory, dst_);
|
||||
} else if (dstMemory != nullptr && srcMemory == nullptr) {
|
||||
status = ihipMemcpyCommand(command, dstMemory, src_, count_, kind_, *stream, dOffset);
|
||||
type = ihipGetMemcpyType(src_, dstMemory);
|
||||
}
|
||||
|
||||
if (type == hipCopyBuffer) {
|
||||
amd::CopyMemoryCommand* cpycmd = reinterpret_cast<amd::CopyMemoryCommand*>(command);
|
||||
amd::CopyMetadata copyMetadata = cpycmd->copyMetadata();
|
||||
@@ -2005,7 +2030,18 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
|
||||
}
|
||||
virtual bool GraphCaptureEnabled() override {
|
||||
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
|
||||
hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_);
|
||||
hip::MemcpyType type;
|
||||
|
||||
size_t dOffset, sOffset;
|
||||
amd::Memory* dstMemory = getMemoryObject(dst_, dOffset);
|
||||
amd::Memory* srcMemory = getMemoryObject(src_, sOffset);
|
||||
|
||||
// The case below is only interested in hipCopyBuffer,
|
||||
// which is only valid for device to device copies.
|
||||
if (dstMemory != nullptr && srcMemory != nullptr) {
|
||||
type = ihipGetMemcpyType(srcMemory, dstMemory, kind_);
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case hipCopyBuffer:
|
||||
return true;
|
||||
@@ -2052,7 +2088,21 @@ class GraphMemcpyNodeFromSymbol : public GraphMemcpyNode1D {
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *stream);
|
||||
|
||||
size_t devOffset, dOffset;
|
||||
amd::Memory* devMemory = getMemoryObject(device_ptr, devOffset);
|
||||
amd::Memory* dstMemory = getMemoryObject(dst_, dOffset);
|
||||
|
||||
if (devMemory == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (dstMemory != nullptr) {
|
||||
status = ihipMemcpyCommand(command, dstMemory, devMemory, count_, kind_, *stream, dOffset, devOffset);
|
||||
} else {
|
||||
status = ihipMemcpyCommand(command, dst_, devMemory, count_, kind_, *stream, devOffset);
|
||||
}
|
||||
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
@@ -2147,7 +2197,21 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D {
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *stream);
|
||||
|
||||
size_t devOffset, sOffset;
|
||||
amd::Memory* devMemory = getMemoryObject(device_ptr, devOffset);
|
||||
amd::Memory* srcMemory = getMemoryObject(src_, sOffset);
|
||||
|
||||
if (devMemory == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (srcMemory != nullptr) {
|
||||
status = ihipMemcpyCommand(command, devMemory, srcMemory, count_, kind_, *stream, devOffset, sOffset);
|
||||
} else {
|
||||
status = ihipMemcpyCommand(command, devMemory, src_, count_, kind_, *stream, devOffset);
|
||||
}
|
||||
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -174,8 +174,7 @@ hipError_t hipExternalMemoryGetMappedBuffer(void** devPtr, hipExternalMemory_t e
|
||||
auto buf = reinterpret_cast<amd::ExternalBuffer*>(extMem);
|
||||
|
||||
// Validate bounds
|
||||
if (bufferDesc->size > buf->getSize() ||
|
||||
bufferDesc->offset > buf->getSize() - bufferDesc->size) {
|
||||
if (bufferDesc->size > buf->getSize() || bufferDesc->offset > buf->getSize() - bufferDesc->size) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -461,188 +460,194 @@ bool IsHtoHMemcpyValid(void* dst, const void* src, hipMemcpyKind kind) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) {
|
||||
if (dst == nullptr || src == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
if (static_cast<uint32_t>(kind) > hipMemcpyDefault && kind != hipMemcpyDeviceToDeviceNoCU) {
|
||||
return hipErrorInvalidMemcpyDirection;
|
||||
}
|
||||
size_t sOffset = 0;
|
||||
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
|
||||
size_t dOffset = 0;
|
||||
amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
|
||||
|
||||
if (srcMemory != nullptr) {
|
||||
// Validate Mem Access in case of VMM Memory
|
||||
if (!srcMemory->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], false)) {
|
||||
return hipErrorUnknown;
|
||||
}
|
||||
|
||||
// If the mem object is a VMM sub buffer (subbuffer has parent set),
|
||||
// then use parent's size for validation.
|
||||
if (srcMemory->parent() && (srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
srcMemory = srcMemory->parent();
|
||||
}
|
||||
|
||||
// Size validation
|
||||
if (sizeBytes > (srcMemory->getSize() - sOffset)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hipError_t ihipMemcpy_validate_memory(amd::Memory* memObj, size_t sizeBytes, size_t offset,
|
||||
bool read_write) {
|
||||
// Validate Mem Access in case of VMM Memory
|
||||
if (!memObj->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], read_write)) {
|
||||
return hipErrorUnknown;
|
||||
}
|
||||
|
||||
if (dstMemory != nullptr) {
|
||||
// Validate Mem Access in case of VMM Memory
|
||||
if (!dstMemory->ValidateMemAccess(*hip::getCurrentDevice()->devices()[0], true)) {
|
||||
return hipErrorUnknown;
|
||||
}
|
||||
|
||||
// If the mem object is a VMM sub buffer (subbuffer has parent set),
|
||||
// then use parent's size for validation.
|
||||
if (dstMemory->parent() && (dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
dstMemory = dstMemory->parent();
|
||||
}
|
||||
|
||||
// Size validation
|
||||
if (sizeBytes > (dstMemory->getSize() - dOffset)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
// If the mem object is a VMM sub buffer (subbuffer has parent set),
|
||||
// then use parent's size for validation.
|
||||
if (memObj->parent() && (memObj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
memObj = memObj->parent();
|
||||
}
|
||||
|
||||
// If src and dst ptr are null then kind must be either h2h or def.
|
||||
if (!IsHtoHMemcpyValid(dst, src, kind)) {
|
||||
// Size validation
|
||||
if (sizeBytes > (memObj->getSize() - offset)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hip::MemcpyType ihipGetMemcpyType(const void* src, void* dst, hipMemcpyKind kind) {
|
||||
size_t sOffset = 0;
|
||||
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
|
||||
size_t dOffset = 0;
|
||||
amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
|
||||
hip::MemcpyType type;
|
||||
if (srcMemory == nullptr && dstMemory == nullptr) {
|
||||
type = hipHostToHost;
|
||||
} else if ((srcMemory == nullptr) && (dstMemory != nullptr)) {
|
||||
type = hipWriteBuffer;
|
||||
} else if ((srcMemory != nullptr) && (dstMemory == nullptr)) {
|
||||
type = hipReadBuffer;
|
||||
} else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
|
||||
// Check if the queue device doesn't match the device on any memory object.
|
||||
// And any of them are not host allocation.
|
||||
// Hence it's a P2P transfer, because the app has requested access to another GPU
|
||||
if ((srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) &&
|
||||
((srcMemory->getContext().devices().size() == 1) &&
|
||||
(dstMemory->getContext().devices().size() == 1))) {
|
||||
type = hipCopyBufferP2P;
|
||||
} else if (kind == hipMemcpyDeviceToDeviceNoCU) {
|
||||
type = hipCopyBufferSDMA;
|
||||
} else {
|
||||
type = hipCopyBuffer;
|
||||
}
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes,
|
||||
hipMemcpyKind kind, hip::Stream& stream, bool isAsync) {
|
||||
amd::Command::EventWaitList waitList;
|
||||
size_t sOffset = 0;
|
||||
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
|
||||
size_t dOffset = 0;
|
||||
amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
|
||||
amd::Device* queueDevice = &stream.device();
|
||||
amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
|
||||
hip::MemcpyType type = ihipGetMemcpyType(src, dst, kind);
|
||||
hip::Stream* pStream = &stream;
|
||||
switch (type) {
|
||||
case hipWriteBuffer:
|
||||
if (queueDevice != dstMemory->GetDeviceById() &&
|
||||
!(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
|
||||
amd::Command* cmd = stream.getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList.push_back(cmd);
|
||||
}
|
||||
}
|
||||
command = new amd::WriteMemoryCommand(*pStream, CL_COMMAND_WRITE_BUFFER, waitList,
|
||||
*dstMemory->asBuffer(), dOffset, sizeBytes, src, 0, 0,
|
||||
copyMetadata);
|
||||
break;
|
||||
case hipReadBuffer:
|
||||
if (queueDevice != srcMemory->GetDeviceById() &&
|
||||
!(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
|
||||
amd::Command* cmd = stream.getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList.push_back(cmd);
|
||||
}
|
||||
}
|
||||
command = new amd::ReadMemoryCommand(*pStream, CL_COMMAND_READ_BUFFER, waitList,
|
||||
*srcMemory->asBuffer(), sOffset, sizeBytes, dst, 0, 0,
|
||||
copyMetadata);
|
||||
break;
|
||||
case hipCopyBufferP2P:
|
||||
command = new amd::CopyMemoryP2PCommand(stream, CL_COMMAND_COPY_BUFFER, waitList,
|
||||
*srcMemory->asBuffer(), *dstMemory->asBuffer(),
|
||||
sOffset, dOffset, sizeBytes);
|
||||
if (command == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
// Make sure runtime has valid memory for the command execution. P2P access
|
||||
// requires page table mapping on the current device to another GPU memory
|
||||
if (!static_cast<amd::CopyMemoryP2PCommand*>(command)->validateMemory()) {
|
||||
delete command;
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
break;
|
||||
case hipCopyBufferSDMA:
|
||||
copyMetadata.copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::SDMA;
|
||||
case hipCopyBuffer:
|
||||
if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) &&
|
||||
queueDevice != srcMemory->GetDeviceById()) {
|
||||
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
|
||||
amd::Command* cmd = stream.getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList.push_back(cmd);
|
||||
}
|
||||
} else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) {
|
||||
// Scenarios such as DtoH where dst is pinned memory
|
||||
if ((queueDevice != srcMemory->GetDeviceById()) &&
|
||||
(dstMemory->getContext().devices().size() != 1) &&
|
||||
!(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
|
||||
amd::Command* cmd = stream.getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList.push_back(cmd);
|
||||
}
|
||||
// Scenarios such as HtoD where src is pinned memory
|
||||
} else if ((queueDevice != dstMemory->GetDeviceById()) &&
|
||||
(srcMemory->getContext().devices().size() != 1) &&
|
||||
!(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
|
||||
amd::Command* cmd = stream.getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList.push_back(cmd);
|
||||
}
|
||||
}
|
||||
}
|
||||
command = new amd::CopyMemoryCommand(*pStream, CL_COMMAND_COPY_BUFFER, waitList,
|
||||
*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset,
|
||||
dOffset, sizeBytes, copyMetadata);
|
||||
break;
|
||||
}
|
||||
if (command == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
if (waitList.size() > 0) {
|
||||
waitList[0]->release();
|
||||
}
|
||||
hipError_t ihipMemcpy_validate(amd::Memory* dstMemory, amd::Memory* srcMemory, size_t sizeBytes,
|
||||
size_t dstOffset, size_t srcOffset) {
|
||||
hipError_t status;
|
||||
|
||||
status = ihipMemcpy_validate_memory(srcMemory, sizeBytes, srcOffset, /*read_write*/ false);
|
||||
if (status != hipSuccess) return status;
|
||||
status = ihipMemcpy_validate_memory(dstMemory, sizeBytes, dstOffset, /*read_write*/ true);
|
||||
if (status != hipSuccess) return status;
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hip::MemcpyType ihipGetMemcpyType(amd::Memory* src, amd::Memory* dst, hipMemcpyKind kind) {
|
||||
if ((src->GetDeviceById() != dst->GetDeviceById()) &&
|
||||
((src->getContext().devices().size() == 1) && (dst->getContext().devices().size() == 1))) {
|
||||
return hipCopyBufferP2P;
|
||||
} else if (kind == hipMemcpyDeviceToDeviceNoCU) {
|
||||
return hipCopyBufferSDMA;
|
||||
}
|
||||
return hipCopyBuffer;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Helper class to manage common memcpy command state and cleanup
|
||||
class MemcpyCommandHelper {
|
||||
public:
|
||||
MemcpyCommandHelper(hip::Stream& stream, bool isAsync)
|
||||
: waitList_(),
|
||||
copyMetadata_(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE),
|
||||
pStream_(&stream),
|
||||
queueDevice_(&stream.device()) {}
|
||||
|
||||
~MemcpyCommandHelper() {
|
||||
// Cleanup: release waitList command if present
|
||||
if (waitList_.size() > 0) {
|
||||
waitList_[0]->release();
|
||||
}
|
||||
}
|
||||
|
||||
// Non-copyable, non-movable
|
||||
MemcpyCommandHelper(const MemcpyCommandHelper&) = delete;
|
||||
MemcpyCommandHelper& operator=(const MemcpyCommandHelper&) = delete;
|
||||
|
||||
amd::Command::EventWaitList& waitList() { return waitList_; }
|
||||
amd::CopyMetadata& copyMetadata() { return copyMetadata_; }
|
||||
hip::Stream*& pStream() { return pStream_; }
|
||||
amd::Device* queueDevice() const { return queueDevice_; }
|
||||
|
||||
// Helper to add wait command from stream
|
||||
void addWaitCommand(hip::Stream& stream) {
|
||||
amd::Command* cmd = stream.getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList_.push_back(cmd);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper to switch stream and add wait command
|
||||
void switchStreamAndWait(hip::Stream& originalStream, amd::Context& context) {
|
||||
pStream_ = hip::getNullStream(context);
|
||||
addWaitCommand(originalStream);
|
||||
}
|
||||
|
||||
// Common error handling for command creation
|
||||
static hipError_t checkCommand(amd::Command* command) {
|
||||
if (command == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
private:
|
||||
amd::Command::EventWaitList waitList_;
|
||||
amd::CopyMetadata copyMetadata_;
|
||||
hip::Stream* pStream_;
|
||||
amd::Device* queueDevice_;
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, const void* srcMemory,
|
||||
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
|
||||
size_t dstOffset, bool isAsync) {
|
||||
MemcpyCommandHelper helper(stream, isAsync);
|
||||
|
||||
if (&stream.device() != dstMemory->GetDeviceById() &&
|
||||
!(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
helper.switchStreamAndWait(stream, dstMemory->GetDeviceById()->context());
|
||||
}
|
||||
command = new amd::WriteMemoryCommand(
|
||||
*helper.pStream(), CL_COMMAND_WRITE_BUFFER, helper.waitList(), *dstMemory->asBuffer(),
|
||||
dstOffset, sizeBytes, srcMemory, 0, 0, helper.copyMetadata());
|
||||
return MemcpyCommandHelper::checkCommand(command);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t ihipMemcpyCommand(amd::Command*& command, void* dstMemory, amd::Memory* srcMemory,
|
||||
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
|
||||
size_t srcOffset, bool isAsync) {
|
||||
MemcpyCommandHelper helper(stream, isAsync);
|
||||
|
||||
if (helper.queueDevice() != srcMemory->GetDeviceById() &&
|
||||
!(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context());
|
||||
}
|
||||
command = new amd::ReadMemoryCommand(*helper.pStream(), CL_COMMAND_READ_BUFFER, helper.waitList(),
|
||||
*srcMemory->asBuffer(), srcOffset, sizeBytes,
|
||||
dstMemory, 0, 0, helper.copyMetadata());
|
||||
return MemcpyCommandHelper::checkCommand(command);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t ihipMemcpyCommand(amd::Command*& command, amd::Memory* dstMemory, amd::Memory* srcMemory,
|
||||
size_t sizeBytes, hipMemcpyKind kind, hip::Stream& stream,
|
||||
size_t dstOffset, size_t srcOffset, bool isAsync) {
|
||||
MemcpyCommandHelper helper(stream, isAsync);
|
||||
|
||||
hip::MemcpyType type = ihipGetMemcpyType(srcMemory, dstMemory, kind);
|
||||
switch (type) {
|
||||
case hipCopyBufferP2P:
|
||||
command = new amd::CopyMemoryP2PCommand(
|
||||
stream, CL_COMMAND_COPY_BUFFER, helper.waitList(), *srcMemory->asBuffer(),
|
||||
*dstMemory->asBuffer(), srcOffset, dstOffset, sizeBytes);
|
||||
{
|
||||
hipError_t status = MemcpyCommandHelper::checkCommand(command);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
// Make sure runtime has valid memory for the command execution. P2P access
|
||||
// requires page table mapping on the current device to another GPU memory
|
||||
if (!static_cast<amd::CopyMemoryP2PCommand*>(command)->validateMemory()) {
|
||||
delete command;
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case hipCopyBufferSDMA:
|
||||
helper.copyMetadata().copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::SDMA;
|
||||
case hipCopyBuffer:
|
||||
if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) &&
|
||||
helper.queueDevice() != srcMemory->GetDeviceById()) {
|
||||
helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context());
|
||||
} else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) {
|
||||
// Scenarios such as DtoH where dst is pinned memory
|
||||
if ((helper.queueDevice() != srcMemory->GetDeviceById()) &&
|
||||
(dstMemory->getContext().devices().size() != 1) &&
|
||||
!(srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
helper.switchStreamAndWait(stream, srcMemory->GetDeviceById()->context());
|
||||
// Scenarios such as HtoD where src is pinned memory
|
||||
} else if ((helper.queueDevice() != dstMemory->GetDeviceById()) &&
|
||||
(srcMemory->getContext().devices().size() != 1) &&
|
||||
!(dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
|
||||
helper.switchStreamAndWait(stream, dstMemory->GetDeviceById()->context());
|
||||
}
|
||||
}
|
||||
command = new amd::CopyMemoryCommand(
|
||||
*helper.pStream(), CL_COMMAND_COPY_BUFFER, helper.waitList(), *srcMemory->asBuffer(),
|
||||
*dstMemory->asBuffer(), srcOffset, dstOffset, sizeBytes,
|
||||
helper.copyMetadata());
|
||||
break;
|
||||
case hipHostToHost:
|
||||
assert(false && "Unreachable case");
|
||||
break;
|
||||
}
|
||||
return MemcpyCommandHelper::checkCommand(command);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool IsHtoHMemcpy(void* dst, const void* src) {
|
||||
size_t sOffset = 0;
|
||||
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
|
||||
@@ -668,43 +673,79 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
|
||||
// Skip if nothing needs writing.
|
||||
return hipSuccess;
|
||||
}
|
||||
status = ihipMemcpy_validate(dst, src, sizeBytes, kind);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
if (dst == nullptr || src == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
if (static_cast<uint32_t>(kind) > hipMemcpyDefault && kind != hipMemcpyDeviceToDeviceNoCU) {
|
||||
return hipErrorInvalidMemcpyDirection;
|
||||
}
|
||||
if (src == dst && kind == hipMemcpyDefault) {
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
size_t sOffset = 0;
|
||||
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
|
||||
amd::Memory* srcDeviceMemory = getMemoryObject(src, sOffset);
|
||||
size_t dOffset = 0;
|
||||
amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
|
||||
|
||||
hipMemoryType srcMemoryType = getMemoryType(srcMemory);
|
||||
hipMemoryType dstMemoryType = getMemoryType(dstMemory);
|
||||
|
||||
if (srcMemory == nullptr && dstMemory == nullptr) {
|
||||
ihipHtoHMemcpy(dst, src, sizeBytes, stream);
|
||||
return hipSuccess;
|
||||
} else if (((srcMemory == nullptr) && (dstMemory != nullptr)) ||
|
||||
((srcMemory != nullptr) && (dstMemory == nullptr))) {
|
||||
// Unpinned copy wait behavior is enforced in the lower copy layers so skip
|
||||
// wait at top level except for MT path
|
||||
isHostAsync &= AMD_DIRECT_DISPATCH ? true : false;
|
||||
} else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) {
|
||||
// Device to Device copies do not need to host side synchronization.
|
||||
if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) &&
|
||||
(!srcMemory->getUserData().sync_mem_ops_ || !dstMemory->getUserData().sync_mem_ops_)) {
|
||||
isHostAsync = true;
|
||||
}
|
||||
// Any Host to any Host need host side synchronization.
|
||||
if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) {
|
||||
isHostAsync = false;
|
||||
}
|
||||
amd::Memory* dstDeviceMemory = getMemoryObject(dst, dOffset);
|
||||
|
||||
// Handle kind vs memobject miss matches
|
||||
if (kind == hipMemcpyDeviceToHost && srcDeviceMemory == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
if (kind == hipMemcpyHostToDevice && dstDeviceMemory == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
amd::Command* command = nullptr;
|
||||
status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, stream, isHostAsync);
|
||||
if (srcDeviceMemory == nullptr && dstDeviceMemory == nullptr) {
|
||||
ihipHtoHMemcpy(dst, src, sizeBytes, stream);
|
||||
return hipSuccess;
|
||||
} else if (dstDeviceMemory == nullptr || srcDeviceMemory == nullptr) {
|
||||
// Unpinned copy wait behavior is enforced in the lower copy layers so skip
|
||||
// wait at top level except for MT path
|
||||
isHostAsync &= AMD_DIRECT_DISPATCH ? true : false;
|
||||
if (dstDeviceMemory != nullptr) {
|
||||
status = ihipMemcpy_validate_memory(dstDeviceMemory, sizeBytes, dOffset, /*read_write*/ true);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
status =
|
||||
ihipMemcpyCommand(command, dstDeviceMemory, src, sizeBytes, kind, stream, dOffset, isHostAsync);
|
||||
} else {
|
||||
status =
|
||||
ihipMemcpy_validate_memory(srcDeviceMemory, sizeBytes, sOffset, /*read_write*/ false);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
status =
|
||||
ihipMemcpyCommand(command, dst, srcDeviceMemory, sizeBytes, kind, stream, sOffset, isHostAsync);
|
||||
}
|
||||
} else {
|
||||
// Both are AMD memory
|
||||
hipMemoryType srcMemoryType = getMemoryType(srcDeviceMemory);
|
||||
hipMemoryType dstMemoryType = getMemoryType(dstDeviceMemory);
|
||||
|
||||
status = ihipMemcpy_validate(dstDeviceMemory, srcDeviceMemory, sizeBytes, dOffset, sOffset);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
|
||||
if (srcDeviceMemory->GetDeviceById() == dstDeviceMemory->GetDeviceById()) {
|
||||
// Device to Device copies do not need to host side synchronization.
|
||||
if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) &&
|
||||
(!srcDeviceMemory->getUserData().sync_mem_ops_ ||
|
||||
!dstDeviceMemory->getUserData().sync_mem_ops_)) {
|
||||
isHostAsync = true;
|
||||
}
|
||||
// Any Host to any Host need host side synchronization.
|
||||
if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) {
|
||||
isHostAsync = false;
|
||||
}
|
||||
}
|
||||
status = ihipMemcpyCommand(command, dstDeviceMemory, srcDeviceMemory, sizeBytes, kind, stream,
|
||||
dOffset, sOffset, isHostAsync);
|
||||
}
|
||||
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
@@ -712,7 +753,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
|
||||
if (!isHostAsync) {
|
||||
command->queue()->finishCommand(command);
|
||||
} else if (!isGPUAsync) {
|
||||
hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
|
||||
hip::Stream* pStream = hip::getNullStream(dstDeviceMemory->GetDeviceById()->context());
|
||||
amd::Command::EventWaitList waitList;
|
||||
waitList.push_back(command);
|
||||
amd::Command* depdentMarker = new amd::Marker(*pStream, false, waitList);
|
||||
@@ -3684,7 +3725,7 @@ hipError_t ihipPointerGetAttributes(void* data, hipPointer_attribute attribute,
|
||||
case HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE: {
|
||||
if (memObj) {
|
||||
if (getMemoryType(memObj) == hipMemoryTypeHost) {
|
||||
// host pointer, pinned or registered memory
|
||||
// host pointer, pinned or registered memory
|
||||
*reinterpret_cast<int*>(data) = 0;
|
||||
} else if ((memObj->getMemFlags() & kManagedAlloc) == kManagedAlloc) {
|
||||
// managed allocation
|
||||
|
||||
Referens i nytt ärende
Block a user