SWDEV-363074 - Clean-up sync between SDMA and compute

HIP can't rely on the resource tracking, used in OCL and requires different explicit sync.
Make sure ROCCLR syncs compute only when SDMA is used and vise versa.
The new logic will allow to enable CPDMA without unnecessary waits.

Change-Id: Ib9d1788cfd5afa5ea2fec4c96a37d8b9c4d0059d


[ROCm/clr commit: ff6b4db70b]
This commit is contained in:
German
2022-10-27 18:22:38 -04:00
committed by German Andryeyev
parent 30ec1ca8df
commit c04e2300c8
4 changed files with 42 additions and 47 deletions
+15 -26
View File
@@ -47,8 +47,6 @@ inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const {
bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** xferBuf,
size_t origin, size_t& offset, size_t& totalSize,
size_t xferSize) const {
gpu().releaseGpuMemoryFence();
amd::Coord3D dst(0, 0, 0);
size_t tmpSize;
uint idxWrite = 0;
@@ -120,11 +118,10 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory**
bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ ||
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
gpu().releaseGpuMemoryFence();
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
} else {
size_t srcSize = size[0];
@@ -210,11 +207,10 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
const amd::Coord3D& size, bool entire) const {
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
if (setup_.disableReadBufferRect_ ||
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
gpu().releaseGpuMemoryFence();
return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
} else {
Memory& xferBuf = dev().xferRead().acquire();
@@ -293,7 +289,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
} else {
chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize());
chunkSize = std::max(chunkSize, 64 * Ki);
flushDMA = true;
flushDMA = (xferSize > chunkSize);
}
size_t srcOffset = 0;
uint32_t flags = Resource::NoWait;
@@ -332,13 +328,12 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBuffer_ ||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
gpuMem(dstMemory).isPersistentDirectMap()) {
gpu().releaseGpuMemoryFence();
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
} else {
size_t dstSize = size[0];
@@ -401,12 +396,12 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
}
}
if (dstSize != 0) {
Memory& xferBuf = gpu().xferWrite().Acquire(dstSize);
while (dstSize > 0) {
auto xfer_size = std::min(dstSize, gpu().xferWrite().MaxSize());
Memory& xferBuf = gpu().xferWrite().Acquire(xfer_size);
// Write memory using a staged resource
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize,
dstSize)) {
xfer_size)) {
LogError("DmaBlitManager::writeBuffer failed!");
return false;
}
@@ -422,13 +417,12 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
const amd::BufferRect& hostRect,
const amd::BufferRect& bufRect, const amd::Coord3D& size,
bool entire) const {
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBufferRect_ ||
(dstMemory.isHostMemDirectAccess() &&
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
gpuMem(dstMemory).isPersistentDirectMap()) {
gpu().releaseGpuMemoryFence();
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
} else {
Memory& xferBuf = gpu().xferWrite().Acquire(std::min(gpu().xferWrite().MaxSize(), size[0]));
@@ -481,7 +475,6 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
size_t rowPitch, size_t slicePitch, bool entire) const {
gpu().releaseGpuMemoryFence();
if (setup_.disableWriteImage_) {
return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
entire);
@@ -497,11 +490,10 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire) const {
gpu().releaseGpuMemoryFence();
if (setup_.disableCopyBuffer_ ||
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
!dev().settings().apuSystem_ && gpuMem(dstMemory).isHostMemDirectAccess())) {
gpu().releaseGpuMemoryFence();
return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size);
} else {
return gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
@@ -513,11 +505,10 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
const amd::Coord3D& size, bool entire) const {
gpu().releaseGpuMemoryFence();
if (setup_.disableCopyBufferRect_ ||
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
gpuMem(dstMemory).isHostMemDirectAccess())) {
gpu().releaseGpuMemoryFence();
return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire);
} else {
size_t srcOffset;
@@ -591,10 +582,8 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
const amd::Coord3D& size, bool entire, size_t rowPitch,
size_t slicePitch) const {
bool result = false;
gpu().releaseGpuMemoryFence();
if (setup_.disableCopyImageToBuffer_) {
gpu().releaseGpuMemoryFence();
result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size,
entire, rowPitch, slicePitch);
} else {
@@ -604,6 +593,7 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
// Check if a HostBlit transfer is required
if (completeOperation_ && !result) {
gpu().releaseGpuMemoryFence();
result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size,
entire, rowPitch, slicePitch);
}
@@ -617,10 +607,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
const amd::Coord3D& size, bool entire, size_t rowPitch,
size_t slicePitch) const {
bool result = false;
gpu().releaseGpuMemoryFence();
if (setup_.disableCopyBufferToImage_) {
gpu().releaseGpuMemoryFence();
result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size,
entire, rowPitch, slicePitch);
} else {
@@ -630,6 +618,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
// Check if a HostBlit transfer is required
if (completeOperation_ && !result) {
gpu().releaseGpuMemoryFence();
result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size,
entire, rowPitch, slicePitch);
}
@@ -1430,6 +1430,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
// Make sure compute is done before CP DMA start
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
} else {
gpu.releaseGpuMemoryFence();
gpu.engineID_ = SdmaEngine;
}
+10 -12
View File
@@ -61,7 +61,8 @@ uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType
return allocedQueues;
}
VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType,
// ================================================================================================
VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType queueType,
uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
uint rtCU, amd::CommandQueue::Priority priority,
uint64_t residency_limit, uint max_command_buffers) {
@@ -341,6 +342,7 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b
palDoppRefs_.push_back(doppRef);
}
// ================================================================================================
bool VirtualGPU::Queue::flush() {
if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) {
if (Pal::Result::Success !=
@@ -381,6 +383,13 @@ bool VirtualGPU::Queue::flush() {
submitInfo.fenceCount = 1;
submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_];
if (amd::IS_HIP) {
// HIP disables per resource tracking, because the app may embed SVM ptr into other buffers.
// Force CPU sync if there are pending operations on SDMA, until OS fences will be added
if (iQueue_->Type() == Pal::QueueTypeCompute) {
gpu_.WaitForIdleSdma();
}
}
// Submit command buffer to OS
Pal::Result result;
if (gpu_.rgpCaptureEna()) {
@@ -2695,9 +2704,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
dev().rgpCaptureMgr()->PostDispatch(this);
}
// Mark the flag indicating if a dispatch is outstanding.
state_.hasPendingDispatch_ = true;
return true;
}
@@ -3953,12 +3959,4 @@ void* VirtualGPU::getOrCreateHostcallBuffer() {
return hostcallBuffer_;
}
void VirtualGPU::releaseGpuMemoryFence() {
if (isPendingDispatch() && amd::IS_HIP) {
WaitForIdleCompute();
// Reset the status.
state_.hasPendingDispatch_ = false;
}
}
} // namespace pal
+16 -9
View File
@@ -68,7 +68,7 @@ class VirtualGPU : public device::VirtualDevice {
Queue(const Queue&) = delete;
Queue& operator=(const Queue&) = delete;
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
static Queue* Create(VirtualGPU& gpu, //!< ROCCLR virtual GPU object
Pal::QueueType queueType, //!< PAL queue type
uint engineIdx, //!< Select particular engine index
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
@@ -78,7 +78,7 @@ class VirtualGPU : public device::VirtualDevice {
uint max_command_buffers //!< Number of allocated command buffers
);
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
uint max_command_buffers)
: lock_(nullptr),
iQueue_(nullptr),
@@ -177,7 +177,7 @@ class VirtualGPU : public device::VirtualDevice {
private:
void DumpMemoryReferences() const;
const VirtualGPU& gpu_; //!< OCL virtual GPU object
VirtualGPU& gpu_; //!< ROCCLR virtual GPU object
Pal::IDevice* iDev_; //!< PAL device
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
uint cmdBufIdCurrent_; //!< Current global command buffer ID
@@ -227,7 +227,6 @@ class VirtualGPU : public device::VirtualDevice {
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
uint imageBufferWrtBack_ : 1; //!< Enable image buffer write back
uint hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding
};
uint value_;
State() : value_(0) {}
@@ -562,14 +561,22 @@ class VirtualGPU : public device::VirtualDevice {
}
}
//! Waits for idle on SDMA engine
void WaitForIdleSdma() {
if (events_[SdmaEngine].isValid()) {
queues_[events_[SdmaEngine].engineId_]->waitForEvent(events_[SdmaEngine].id_);
events_[SdmaEngine].invalidate();
}
}
void* getOrCreateHostcallBuffer();
//! Waits on an outstanding kernel.
void releaseGpuMemoryFence();
//! Returns true if a dispatch is pending.
bool isPendingDispatch() const { return state_.hasPendingDispatch_; }
void releaseGpuMemoryFence() {
if (amd::IS_HIP) {
WaitForIdleCompute();
}
}
protected:
void profileEvent(EngineType engine, bool type) const;