diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 6061f346aa..e41d6ed686 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -1879,20 +1879,67 @@ hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void* if (status != hipSuccess) { return status; } + amd::Command::EventWaitList waitList; + amd::CopyMemoryCommand* copyCommand; + amd::Device* queueDevice = &stream->device(); amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::Coord3D dstStart(dstRect.start_, 0, 0); - amd::CopyMemoryCommand* copyCommand = new amd::CopyMemoryCommand( - *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory, - srcStart, dstStart, copyRegion, srcRect, dstRect); + + bool p2pcopy = false; + // Check if the queue device doesn't match the device on any memory object. + // And any of them are not host allocation. + // Hence it's a P2P transfer, because the app has requested access to another GPU + if ((srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) && + ((srcMemory->getContext().devices().size() == 1) && + (dstMemory->getContext().devices().size() == 1))) { + copyCommand = new amd::CopyMemoryP2PCommand(*stream, CL_COMMAND_COPY_BUFFER_RECT, waitList, + *srcMemory, *dstMemory, srcStart, dstStart, copyRegion, srcRect, dstRect); + p2pcopy = true; + } else { + hip::Stream* pStream = stream; + if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) && + (queueDevice != srcMemory->GetDeviceById())) { + pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); + amd::Command* cmd = stream->getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) { + // Scenarios such as DtoH where dst is pinned memory + if ((queueDevice != srcMemory->GetDeviceById()) && + (dstMemory->getContext().devices().size() != 1)) { + pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); + amd::Command* cmd = stream->getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + // Scenarios such as HtoD where src is pinned memory + } else if ((queueDevice != dstMemory->GetDeviceById()) && + (srcMemory->getContext().devices().size() != 1)) { + pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); + amd::Command* cmd = stream->getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } + } + copyCommand = new amd::CopyMemoryCommand( *pStream, CL_COMMAND_COPY_BUFFER_RECT, waitList, + *srcMemory, *dstMemory, srcStart, dstStart, copyRegion, srcRect, dstRect); + } if (copyCommand == nullptr) { return hipErrorOutOfMemory; } - - if (!copyCommand->validatePeerMemory()) { + // Make sure runtime has valid memory for the command execution. P2P access + // requires page table mapping on the current device to another GPU memory + if ((p2pcopy && !static_cast(copyCommand)->validateMemory()) || + (!p2pcopy && !copyCommand->validatePeerMemory())) { delete copyCommand; return hipErrorInvalidValue; } + if (waitList.size() > 0) { + waitList[0]->release(); + } command = copyCommand; return hipSuccess; } diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 0cb4e32f1a..505cd2f832 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -1979,7 +1979,67 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) { } break; } - case CL_COMMAND_COPY_BUFFER_RECT: + case CL_COMMAND_COPY_BUFFER_RECT: { + if (p2pAllowed) { + result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size, + cmd.isEntireMemory(), cmd.copyMetadata()); + } else { + amd::ScopedLock lock(dev().P2PStageOps()); + Memory* dstStgMem = static_cast( + dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0])); + Memory* srcStgMem = static_cast( + dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0])); + + if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) { + result = true; + // Perform 2 step transfer with staging buffer + result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem, cmd.srcRect(), + cmd.srcRect(), size, false, + cmd.copyMetadata()); + + result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem, cmd.srcRect(), + cmd.dstRect(), size, false, + cmd.copyMetadata()); + } + else { + size_t srcOffset; + size_t dstOffset; + result = true; + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = cmd.srcRect().offset(0, y, z); + dstOffset = cmd.dstRect().offset(0, y, z); + + amd::Coord3D srcOrigin(srcOffset); + amd::Coord3D dstOrigin(dstOffset); + size_t copy_size = Device::kP2PStagingSize; + size_t left_size = size[0]; + amd::Coord3D stageOffset(0); + do { + if (left_size <= copy_size) { + copy_size = left_size; + } + left_size -= copy_size; + + // Perform 2 step transfer with staging buffer + result &= srcDevMem->partialMemCopyTo(*(srcDevMem->dev().xferQueue()), srcOrigin, + stageOffset, copy_size, *dstStgMem); + srcDevMem->dev().xferQueue()->waitAllEngines(); + + result &= srcStgMem->partialMemCopyTo(*(dstDevMem->dev().xferQueue()), stageOffset, + dstOrigin, copy_size, *dstDevMem); + srcStgMem->dev().xferQueue()->waitAllEngines(); + + srcOrigin.c[0] += copy_size; + dstOrigin.c[0] += copy_size; + } while (left_size > 0); + } + } + } + } + break; + } case CL_COMMAND_COPY_IMAGE: case CL_COMMAND_COPY_IMAGE_TO_BUFFER: case CL_COMMAND_COPY_BUFFER_TO_IMAGE: diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index e7d91dbf1a..d18e3c4c5c 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -2215,7 +2215,68 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) { } break; } - case CL_COMMAND_COPY_BUFFER_RECT: + case CL_COMMAND_COPY_BUFFER_RECT: { + if (p2pAllowed) { + result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size, + cmd.isEntireMemory(), cmd.copyMetadata()); + } else { + // Sync the current queue, since P2P staging uses the device queues for transfer + releaseGpuMemoryFence(); + + amd::ScopedLock lock(dev().P2PStageOps()); + Memory* dstStgMem = static_cast( + dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0])); + Memory* srcStgMem = static_cast( + dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0])); + + if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) { + result = true; + // Perform 2 step transfer with staging buffer + result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem, cmd.srcRect(), + cmd.srcRect(), size, false, + cmd.copyMetadata()); + + result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem, cmd.srcRect(), + cmd.dstRect(), size, false, + cmd.copyMetadata()); + } + else { + size_t srcOffset; + size_t dstOffset; + result = true; + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = cmd.srcRect().offset(0, y, z); + dstOffset = cmd.dstRect().offset(0, y, z); + + amd::Coord3D srcOrigin(srcOffset); + amd::Coord3D dstOrigin(dstOffset); + size_t copy_size = Device::kP2PStagingSize; + size_t left_size = size[0]; + amd::Coord3D stageOffset(0); + do { + if (left_size <= copy_size) { + copy_size = left_size; + } + left_size -= copy_size; + + // Perform 2 step transfer with staging buffer + result &= srcDevMem->dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin, + stageOffset, copy_size); + + result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset, + dstOrigin, copy_size); + + srcOrigin.c[0] += copy_size; + dstOrigin.c[0] += copy_size; + } while (left_size > 0); + } + } + } + } + break; + } case CL_COMMAND_COPY_IMAGE: case CL_COMMAND_COPY_IMAGE_TO_BUFFER: case CL_COMMAND_COPY_BUFFER_TO_IMAGE: diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index 1001225a91..3bccbbd2ea 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -1749,6 +1749,13 @@ class CopyMemoryP2PCommand : public CopyMemoryCommand { : CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin, size) {} + CopyMemoryP2PCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& srcMemory, Memory& dstMemory, Coord3D srcOrigin, Coord3D dstOrigin, + Coord3D size, const BufferRect& srcRect, const BufferRect& dstRect, + amd::CopyMetadata copyMetadata = amd::CopyMetadata()) + : CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin, + size, srcRect, dstRect) {} + virtual void submit(device::VirtualDevice& device) { device.submitCopyMemoryP2P(*this); } bool validateMemory();