SWDEV-472357 - support Rect copy with staging buffer for 2D & 3D memcpy in PAL

Change-Id: Ie32f3e5a6fa077f6b2db20fc1ab1e2e0da8344cb


[ROCm/clr commit: 41dc4545fc]
Этот коммит содержится в:
Todd tiantuo Li
2024-07-09 05:31:43 -07:00
родитель 55945b16c0
Коммит 170e45b879
4 изменённых файлов: 182 добавлений и 7 удалений
+52 -5
Просмотреть файл
@@ -1879,20 +1879,67 @@ hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void*
if (status != hipSuccess) {
return status;
}
amd::Command::EventWaitList waitList;
amd::CopyMemoryCommand* copyCommand;
amd::Device* queueDevice = &stream->device();
amd::Coord3D srcStart(srcRect.start_, 0, 0);
amd::Coord3D dstStart(dstRect.start_, 0, 0);
amd::CopyMemoryCommand* copyCommand = new amd::CopyMemoryCommand(
*stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory,
srcStart, dstStart, copyRegion, srcRect, dstRect);
bool p2pcopy = false;
// Check if the queue device doesn't match the device on any memory object.
// And any of them are not host allocation.
// Hence it's a P2P transfer, because the app has requested access to another GPU
if ((srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) &&
((srcMemory->getContext().devices().size() == 1) &&
(dstMemory->getContext().devices().size() == 1))) {
copyCommand = new amd::CopyMemoryP2PCommand(*stream, CL_COMMAND_COPY_BUFFER_RECT, waitList,
*srcMemory, *dstMemory, srcStart, dstStart, copyRegion, srcRect, dstRect);
p2pcopy = true;
} else {
hip::Stream* pStream = stream;
if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) &&
(queueDevice != srcMemory->GetDeviceById())) {
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
amd::Command* cmd = stream->getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
} else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) {
// Scenarios such as DtoH where dst is pinned memory
if ((queueDevice != srcMemory->GetDeviceById()) &&
(dstMemory->getContext().devices().size() != 1)) {
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
amd::Command* cmd = stream->getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
// Scenarios such as HtoD where src is pinned memory
} else if ((queueDevice != dstMemory->GetDeviceById()) &&
(srcMemory->getContext().devices().size() != 1)) {
pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
amd::Command* cmd = stream->getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
}
}
copyCommand = new amd::CopyMemoryCommand( *pStream, CL_COMMAND_COPY_BUFFER_RECT, waitList,
*srcMemory, *dstMemory, srcStart, dstStart, copyRegion, srcRect, dstRect);
}
if (copyCommand == nullptr) {
return hipErrorOutOfMemory;
}
if (!copyCommand->validatePeerMemory()) {
// Make sure runtime has valid memory for the command execution. P2P access
// requires page table mapping on the current device to another GPU memory
if ((p2pcopy && !static_cast<amd::CopyMemoryP2PCommand*>(copyCommand)->validateMemory()) ||
(!p2pcopy && !copyCommand->validatePeerMemory())) {
delete copyCommand;
return hipErrorInvalidValue;
}
if (waitList.size() > 0) {
waitList[0]->release();
}
command = copyCommand;
return hipSuccess;
}
+61 -1
Просмотреть файл
@@ -1979,7 +1979,67 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
}
break;
}
case CL_COMMAND_COPY_BUFFER_RECT:
case CL_COMMAND_COPY_BUFFER_RECT: {
if (p2pAllowed) {
result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size,
cmd.isEntireMemory(), cmd.copyMetadata());
} else {
amd::ScopedLock lock(dev().P2PStageOps());
Memory* dstStgMem = static_cast<pal::Memory*>(
dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
Memory* srcStgMem = static_cast<pal::Memory*>(
dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) {
result = true;
// Perform 2 step transfer with staging buffer
result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem, cmd.srcRect(),
cmd.srcRect(), size, false,
cmd.copyMetadata());
result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem, cmd.srcRect(),
cmd.dstRect(), size, false,
cmd.copyMetadata());
}
else {
size_t srcOffset;
size_t dstOffset;
result = true;
for (size_t z = 0; z < size[2]; ++z) {
for (size_t y = 0; y < size[1]; ++y) {
srcOffset = cmd.srcRect().offset(0, y, z);
dstOffset = cmd.dstRect().offset(0, y, z);
amd::Coord3D srcOrigin(srcOffset);
amd::Coord3D dstOrigin(dstOffset);
size_t copy_size = Device::kP2PStagingSize;
size_t left_size = size[0];
amd::Coord3D stageOffset(0);
do {
if (left_size <= copy_size) {
copy_size = left_size;
}
left_size -= copy_size;
// Perform 2 step transfer with staging buffer
result &= srcDevMem->partialMemCopyTo(*(srcDevMem->dev().xferQueue()), srcOrigin,
stageOffset, copy_size, *dstStgMem);
srcDevMem->dev().xferQueue()->waitAllEngines();
result &= srcStgMem->partialMemCopyTo(*(dstDevMem->dev().xferQueue()), stageOffset,
dstOrigin, copy_size, *dstDevMem);
srcStgMem->dev().xferQueue()->waitAllEngines();
srcOrigin.c[0] += copy_size;
dstOrigin.c[0] += copy_size;
} while (left_size > 0);
}
}
}
}
break;
}
case CL_COMMAND_COPY_IMAGE:
case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
+62 -1
Просмотреть файл
@@ -2215,7 +2215,68 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
}
break;
}
case CL_COMMAND_COPY_BUFFER_RECT:
case CL_COMMAND_COPY_BUFFER_RECT: {
if (p2pAllowed) {
result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size,
cmd.isEntireMemory(), cmd.copyMetadata());
} else {
// Sync the current queue, since P2P staging uses the device queues for transfer
releaseGpuMemoryFence();
amd::ScopedLock lock(dev().P2PStageOps());
Memory* dstStgMem = static_cast<Memory*>(
dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
Memory* srcStgMem = static_cast<Memory*>(
dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) {
result = true;
// Perform 2 step transfer with staging buffer
result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem, cmd.srcRect(),
cmd.srcRect(), size, false,
cmd.copyMetadata());
result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem, cmd.srcRect(),
cmd.dstRect(), size, false,
cmd.copyMetadata());
}
else {
size_t srcOffset;
size_t dstOffset;
result = true;
for (size_t z = 0; z < size[2]; ++z) {
for (size_t y = 0; y < size[1]; ++y) {
srcOffset = cmd.srcRect().offset(0, y, z);
dstOffset = cmd.dstRect().offset(0, y, z);
amd::Coord3D srcOrigin(srcOffset);
amd::Coord3D dstOrigin(dstOffset);
size_t copy_size = Device::kP2PStagingSize;
size_t left_size = size[0];
amd::Coord3D stageOffset(0);
do {
if (left_size <= copy_size) {
copy_size = left_size;
}
left_size -= copy_size;
// Perform 2 step transfer with staging buffer
result &= srcDevMem->dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin,
stageOffset, copy_size);
result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
dstOrigin, copy_size);
srcOrigin.c[0] += copy_size;
dstOrigin.c[0] += copy_size;
} while (left_size > 0);
}
}
}
}
break;
}
case CL_COMMAND_COPY_IMAGE:
case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
+7
Просмотреть файл
@@ -1749,6 +1749,13 @@ class CopyMemoryP2PCommand : public CopyMemoryCommand {
: CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin,
size) {}
CopyMemoryP2PCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList,
Memory& srcMemory, Memory& dstMemory, Coord3D srcOrigin, Coord3D dstOrigin,
Coord3D size, const BufferRect& srcRect, const BufferRect& dstRect,
amd::CopyMetadata copyMetadata = amd::CopyMetadata())
: CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin,
size, srcRect, dstRect) {}
virtual void submit(device::VirtualDevice& device) { device.submitCopyMemoryP2P(*this); }
bool validateMemory();