SWDEV-472357 - support Rect copy with staging buffer for 2D & 3D memcpy in PAL
Change-Id: Ie32f3e5a6fa077f6b2db20fc1ab1e2e0da8344cb
[ROCm/clr commit: 41dc4545fc]
Этот коммит содержится в:
@@ -1879,20 +1879,67 @@ hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void*
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
amd::Command::EventWaitList waitList;
|
||||
amd::CopyMemoryCommand* copyCommand;
|
||||
amd::Device* queueDevice = &stream->device();
|
||||
amd::Coord3D srcStart(srcRect.start_, 0, 0);
|
||||
amd::Coord3D dstStart(dstRect.start_, 0, 0);
|
||||
amd::CopyMemoryCommand* copyCommand = new amd::CopyMemoryCommand(
|
||||
*stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory,
|
||||
srcStart, dstStart, copyRegion, srcRect, dstRect);
|
||||
|
||||
bool p2pcopy = false;
|
||||
// Check if the queue device doesn't match the device on any memory object.
|
||||
// And any of them are not host allocation.
|
||||
// Hence it's a P2P transfer, because the app has requested access to another GPU
|
||||
if ((srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) &&
|
||||
((srcMemory->getContext().devices().size() == 1) &&
|
||||
(dstMemory->getContext().devices().size() == 1))) {
|
||||
copyCommand = new amd::CopyMemoryP2PCommand(*stream, CL_COMMAND_COPY_BUFFER_RECT, waitList,
|
||||
*srcMemory, *dstMemory, srcStart, dstStart, copyRegion, srcRect, dstRect);
|
||||
p2pcopy = true;
|
||||
} else {
|
||||
hip::Stream* pStream = stream;
|
||||
if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) &&
|
||||
(queueDevice != srcMemory->GetDeviceById())) {
|
||||
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
|
||||
amd::Command* cmd = stream->getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList.push_back(cmd);
|
||||
}
|
||||
} else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) {
|
||||
// Scenarios such as DtoH where dst is pinned memory
|
||||
if ((queueDevice != srcMemory->GetDeviceById()) &&
|
||||
(dstMemory->getContext().devices().size() != 1)) {
|
||||
pStream = hip::getNullStream(srcMemory->GetDeviceById()->context());
|
||||
amd::Command* cmd = stream->getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList.push_back(cmd);
|
||||
}
|
||||
// Scenarios such as HtoD where src is pinned memory
|
||||
} else if ((queueDevice != dstMemory->GetDeviceById()) &&
|
||||
(srcMemory->getContext().devices().size() != 1)) {
|
||||
pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
|
||||
amd::Command* cmd = stream->getLastQueuedCommand(true);
|
||||
if (cmd != nullptr) {
|
||||
waitList.push_back(cmd);
|
||||
}
|
||||
}
|
||||
}
|
||||
copyCommand = new amd::CopyMemoryCommand( *pStream, CL_COMMAND_COPY_BUFFER_RECT, waitList,
|
||||
*srcMemory, *dstMemory, srcStart, dstStart, copyRegion, srcRect, dstRect);
|
||||
}
|
||||
|
||||
if (copyCommand == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
|
||||
if (!copyCommand->validatePeerMemory()) {
|
||||
// Make sure runtime has valid memory for the command execution. P2P access
|
||||
// requires page table mapping on the current device to another GPU memory
|
||||
if ((p2pcopy && !static_cast<amd::CopyMemoryP2PCommand*>(copyCommand)->validateMemory()) ||
|
||||
(!p2pcopy && !copyCommand->validatePeerMemory())) {
|
||||
delete copyCommand;
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
if (waitList.size() > 0) {
|
||||
waitList[0]->release();
|
||||
}
|
||||
command = copyCommand;
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
@@ -1979,7 +1979,67 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
|
||||
}
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_COPY_BUFFER_RECT:
|
||||
case CL_COMMAND_COPY_BUFFER_RECT: {
|
||||
if (p2pAllowed) {
|
||||
result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size,
|
||||
cmd.isEntireMemory(), cmd.copyMetadata());
|
||||
} else {
|
||||
amd::ScopedLock lock(dev().P2PStageOps());
|
||||
Memory* dstStgMem = static_cast<pal::Memory*>(
|
||||
dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
|
||||
Memory* srcStgMem = static_cast<pal::Memory*>(
|
||||
dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
|
||||
|
||||
if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) {
|
||||
result = true;
|
||||
// Perform 2 step transfer with staging buffer
|
||||
result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem, cmd.srcRect(),
|
||||
cmd.srcRect(), size, false,
|
||||
cmd.copyMetadata());
|
||||
|
||||
result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem, cmd.srcRect(),
|
||||
cmd.dstRect(), size, false,
|
||||
cmd.copyMetadata());
|
||||
}
|
||||
else {
|
||||
size_t srcOffset;
|
||||
size_t dstOffset;
|
||||
result = true;
|
||||
|
||||
for (size_t z = 0; z < size[2]; ++z) {
|
||||
for (size_t y = 0; y < size[1]; ++y) {
|
||||
srcOffset = cmd.srcRect().offset(0, y, z);
|
||||
dstOffset = cmd.dstRect().offset(0, y, z);
|
||||
|
||||
amd::Coord3D srcOrigin(srcOffset);
|
||||
amd::Coord3D dstOrigin(dstOffset);
|
||||
size_t copy_size = Device::kP2PStagingSize;
|
||||
size_t left_size = size[0];
|
||||
amd::Coord3D stageOffset(0);
|
||||
do {
|
||||
if (left_size <= copy_size) {
|
||||
copy_size = left_size;
|
||||
}
|
||||
left_size -= copy_size;
|
||||
|
||||
// Perform 2 step transfer with staging buffer
|
||||
result &= srcDevMem->partialMemCopyTo(*(srcDevMem->dev().xferQueue()), srcOrigin,
|
||||
stageOffset, copy_size, *dstStgMem);
|
||||
srcDevMem->dev().xferQueue()->waitAllEngines();
|
||||
|
||||
result &= srcStgMem->partialMemCopyTo(*(dstDevMem->dev().xferQueue()), stageOffset,
|
||||
dstOrigin, copy_size, *dstDevMem);
|
||||
srcStgMem->dev().xferQueue()->waitAllEngines();
|
||||
|
||||
srcOrigin.c[0] += copy_size;
|
||||
dstOrigin.c[0] += copy_size;
|
||||
} while (left_size > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_COPY_IMAGE:
|
||||
case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
|
||||
case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
|
||||
|
||||
@@ -2215,7 +2215,68 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
|
||||
}
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_COPY_BUFFER_RECT:
|
||||
case CL_COMMAND_COPY_BUFFER_RECT: {
|
||||
if (p2pAllowed) {
|
||||
result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size,
|
||||
cmd.isEntireMemory(), cmd.copyMetadata());
|
||||
} else {
|
||||
// Sync the current queue, since P2P staging uses the device queues for transfer
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
amd::ScopedLock lock(dev().P2PStageOps());
|
||||
Memory* dstStgMem = static_cast<Memory*>(
|
||||
dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
|
||||
Memory* srcStgMem = static_cast<Memory*>(
|
||||
dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
|
||||
|
||||
if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) {
|
||||
result = true;
|
||||
// Perform 2 step transfer with staging buffer
|
||||
result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem, cmd.srcRect(),
|
||||
cmd.srcRect(), size, false,
|
||||
cmd.copyMetadata());
|
||||
|
||||
result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem, cmd.srcRect(),
|
||||
cmd.dstRect(), size, false,
|
||||
cmd.copyMetadata());
|
||||
}
|
||||
else {
|
||||
size_t srcOffset;
|
||||
size_t dstOffset;
|
||||
result = true;
|
||||
|
||||
for (size_t z = 0; z < size[2]; ++z) {
|
||||
for (size_t y = 0; y < size[1]; ++y) {
|
||||
srcOffset = cmd.srcRect().offset(0, y, z);
|
||||
dstOffset = cmd.dstRect().offset(0, y, z);
|
||||
|
||||
amd::Coord3D srcOrigin(srcOffset);
|
||||
amd::Coord3D dstOrigin(dstOffset);
|
||||
size_t copy_size = Device::kP2PStagingSize;
|
||||
size_t left_size = size[0];
|
||||
amd::Coord3D stageOffset(0);
|
||||
do {
|
||||
if (left_size <= copy_size) {
|
||||
copy_size = left_size;
|
||||
}
|
||||
left_size -= copy_size;
|
||||
|
||||
// Perform 2 step transfer with staging buffer
|
||||
result &= srcDevMem->dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin,
|
||||
stageOffset, copy_size);
|
||||
|
||||
result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
|
||||
dstOrigin, copy_size);
|
||||
|
||||
srcOrigin.c[0] += copy_size;
|
||||
dstOrigin.c[0] += copy_size;
|
||||
} while (left_size > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_COPY_IMAGE:
|
||||
case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
|
||||
case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
|
||||
|
||||
@@ -1749,6 +1749,13 @@ class CopyMemoryP2PCommand : public CopyMemoryCommand {
|
||||
: CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin,
|
||||
size) {}
|
||||
|
||||
CopyMemoryP2PCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList,
|
||||
Memory& srcMemory, Memory& dstMemory, Coord3D srcOrigin, Coord3D dstOrigin,
|
||||
Coord3D size, const BufferRect& srcRect, const BufferRect& dstRect,
|
||||
amd::CopyMetadata copyMetadata = amd::CopyMetadata())
|
||||
: CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin,
|
||||
size, srcRect, dstRect) {}
|
||||
|
||||
virtual void submit(device::VirtualDevice& device) { device.submitCopyMemoryP2P(*this); }
|
||||
|
||||
bool validateMemory();
|
||||
|
||||
Ссылка в новой задаче
Block a user