SWDEV-363074 - Clean-up sync between SDMA and compute
HIP can't rely on the resource tracking, used in OCL and requires different explicit sync.
Make sure ROCCLR syncs compute only when SDMA is used and vise versa.
The new logic will allow to enable CPDMA without unnecessary waits.
Change-Id: Ib9d1788cfd5afa5ea2fec4c96a37d8b9c4d0059d
[ROCm/clr commit: ff6b4db70b]
This commit is contained in:
@@ -47,8 +47,6 @@ inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const {
|
||||
bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** xferBuf,
|
||||
size_t origin, size_t& offset, size_t& totalSize,
|
||||
size_t xferSize) const {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
amd::Coord3D dst(0, 0, 0);
|
||||
size_t tmpSize;
|
||||
uint idxWrite = 0;
|
||||
@@ -120,11 +118,10 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory**
|
||||
bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBuffer_ ||
|
||||
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
|
||||
} else {
|
||||
size_t srcSize = size[0];
|
||||
@@ -210,11 +207,10 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
|
||||
const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBufferRect_ ||
|
||||
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
|
||||
} else {
|
||||
Memory& xferBuf = dev().xferRead().acquire();
|
||||
@@ -293,7 +289,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
|
||||
} else {
|
||||
chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize());
|
||||
chunkSize = std::max(chunkSize, 64 * Ki);
|
||||
flushDMA = true;
|
||||
flushDMA = (xferSize > chunkSize);
|
||||
}
|
||||
size_t srcOffset = 0;
|
||||
uint32_t flags = Resource::NoWait;
|
||||
@@ -332,13 +328,12 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
|
||||
bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access or it's persistent
|
||||
if (setup_.disableWriteBuffer_ ||
|
||||
(gpuMem(dstMemory).isHostMemDirectAccess() &&
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
gpuMem(dstMemory).isPersistentDirectMap()) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
|
||||
} else {
|
||||
size_t dstSize = size[0];
|
||||
@@ -401,12 +396,12 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
}
|
||||
}
|
||||
|
||||
if (dstSize != 0) {
|
||||
Memory& xferBuf = gpu().xferWrite().Acquire(dstSize);
|
||||
|
||||
while (dstSize > 0) {
|
||||
auto xfer_size = std::min(dstSize, gpu().xferWrite().MaxSize());
|
||||
Memory& xferBuf = gpu().xferWrite().Acquire(xfer_size);
|
||||
// Write memory using a staged resource
|
||||
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize,
|
||||
dstSize)) {
|
||||
xfer_size)) {
|
||||
LogError("DmaBlitManager::writeBuffer failed!");
|
||||
return false;
|
||||
}
|
||||
@@ -422,13 +417,12 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
const amd::BufferRect& hostRect,
|
||||
const amd::BufferRect& bufRect, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access or it's persistent
|
||||
if (setup_.disableWriteBufferRect_ ||
|
||||
(dstMemory.isHostMemDirectAccess() &&
|
||||
(gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
|
||||
gpuMem(dstMemory).isPersistentDirectMap()) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
|
||||
} else {
|
||||
Memory& xferBuf = gpu().xferWrite().Acquire(std::min(gpu().xferWrite().MaxSize(), size[0]));
|
||||
@@ -481,7 +475,6 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
size_t rowPitch, size_t slicePitch, bool entire) const {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableWriteImage_) {
|
||||
return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
|
||||
entire);
|
||||
@@ -497,11 +490,10 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
|
||||
bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableCopyBuffer_ ||
|
||||
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
|
||||
!dev().settings().apuSystem_ && gpuMem(dstMemory).isHostMemDirectAccess())) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size);
|
||||
} else {
|
||||
return gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
|
||||
@@ -513,11 +505,10 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
|
||||
bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableCopyBufferRect_ ||
|
||||
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
|
||||
gpuMem(dstMemory).isHostMemDirectAccess())) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire);
|
||||
} else {
|
||||
size_t srcOffset;
|
||||
@@ -591,10 +582,8 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
|
||||
const amd::Coord3D& size, bool entire, size_t rowPitch,
|
||||
size_t slicePitch) const {
|
||||
bool result = false;
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
|
||||
if (setup_.disableCopyImageToBuffer_) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size,
|
||||
entire, rowPitch, slicePitch);
|
||||
} else {
|
||||
@@ -604,6 +593,7 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
|
||||
|
||||
// Check if a HostBlit transfer is required
|
||||
if (completeOperation_ && !result) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size,
|
||||
entire, rowPitch, slicePitch);
|
||||
}
|
||||
@@ -617,10 +607,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
|
||||
const amd::Coord3D& size, bool entire, size_t rowPitch,
|
||||
size_t slicePitch) const {
|
||||
bool result = false;
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
|
||||
if (setup_.disableCopyBufferToImage_) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size,
|
||||
entire, rowPitch, slicePitch);
|
||||
} else {
|
||||
@@ -630,6 +618,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
|
||||
|
||||
// Check if a HostBlit transfer is required
|
||||
if (completeOperation_ && !result) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size,
|
||||
entire, rowPitch, slicePitch);
|
||||
}
|
||||
|
||||
@@ -1430,6 +1430,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
|
||||
// Make sure compute is done before CP DMA start
|
||||
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
|
||||
} else {
|
||||
gpu.releaseGpuMemoryFence();
|
||||
gpu.engineID_ = SdmaEngine;
|
||||
}
|
||||
|
||||
|
||||
@@ -61,7 +61,8 @@ uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType
|
||||
return allocedQueues;
|
||||
}
|
||||
|
||||
VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType,
|
||||
// ================================================================================================
|
||||
VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType queueType,
|
||||
uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
|
||||
uint rtCU, amd::CommandQueue::Priority priority,
|
||||
uint64_t residency_limit, uint max_command_buffers) {
|
||||
@@ -341,6 +342,7 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b
|
||||
palDoppRefs_.push_back(doppRef);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::Queue::flush() {
|
||||
if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) {
|
||||
if (Pal::Result::Success !=
|
||||
@@ -381,6 +383,13 @@ bool VirtualGPU::Queue::flush() {
|
||||
submitInfo.fenceCount = 1;
|
||||
submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_];
|
||||
|
||||
if (amd::IS_HIP) {
|
||||
// HIP disables per resource tracking, because the app may embed SVM ptr into other buffers.
|
||||
// Force CPU sync if there are pending operations on SDMA, until OS fences will be added
|
||||
if (iQueue_->Type() == Pal::QueueTypeCompute) {
|
||||
gpu_.WaitForIdleSdma();
|
||||
}
|
||||
}
|
||||
// Submit command buffer to OS
|
||||
Pal::Result result;
|
||||
if (gpu_.rgpCaptureEna()) {
|
||||
@@ -2695,9 +2704,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
dev().rgpCaptureMgr()->PostDispatch(this);
|
||||
}
|
||||
|
||||
// Mark the flag indicating if a dispatch is outstanding.
|
||||
state_.hasPendingDispatch_ = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -3953,12 +3959,4 @@ void* VirtualGPU::getOrCreateHostcallBuffer() {
|
||||
return hostcallBuffer_;
|
||||
}
|
||||
|
||||
void VirtualGPU::releaseGpuMemoryFence() {
|
||||
if (isPendingDispatch() && amd::IS_HIP) {
|
||||
WaitForIdleCompute();
|
||||
// Reset the status.
|
||||
state_.hasPendingDispatch_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
|
||||
@@ -68,7 +68,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
Queue(const Queue&) = delete;
|
||||
Queue& operator=(const Queue&) = delete;
|
||||
|
||||
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
|
||||
static Queue* Create(VirtualGPU& gpu, //!< ROCCLR virtual GPU object
|
||||
Pal::QueueType queueType, //!< PAL queue type
|
||||
uint engineIdx, //!< Select particular engine index
|
||||
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
|
||||
@@ -78,7 +78,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
uint max_command_buffers //!< Number of allocated command buffers
|
||||
);
|
||||
|
||||
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
|
||||
Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
|
||||
uint max_command_buffers)
|
||||
: lock_(nullptr),
|
||||
iQueue_(nullptr),
|
||||
@@ -177,7 +177,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
private:
|
||||
void DumpMemoryReferences() const;
|
||||
const VirtualGPU& gpu_; //!< OCL virtual GPU object
|
||||
VirtualGPU& gpu_; //!< ROCCLR virtual GPU object
|
||||
Pal::IDevice* iDev_; //!< PAL device
|
||||
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
|
||||
uint cmdBufIdCurrent_; //!< Current global command buffer ID
|
||||
@@ -227,7 +227,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
|
||||
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
|
||||
uint imageBufferWrtBack_ : 1; //!< Enable image buffer write back
|
||||
uint hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding
|
||||
};
|
||||
uint value_;
|
||||
State() : value_(0) {}
|
||||
@@ -562,14 +561,22 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
}
|
||||
}
|
||||
|
||||
//! Waits for idle on SDMA engine
|
||||
void WaitForIdleSdma() {
|
||||
if (events_[SdmaEngine].isValid()) {
|
||||
queues_[events_[SdmaEngine].engineId_]->waitForEvent(events_[SdmaEngine].id_);
|
||||
events_[SdmaEngine].invalidate();
|
||||
}
|
||||
}
|
||||
|
||||
void* getOrCreateHostcallBuffer();
|
||||
|
||||
//! Waits on an outstanding kernel.
|
||||
void releaseGpuMemoryFence();
|
||||
|
||||
//! Returns true if a dispatch is pending.
|
||||
bool isPendingDispatch() const { return state_.hasPendingDispatch_; }
|
||||
|
||||
void releaseGpuMemoryFence() {
|
||||
if (amd::IS_HIP) {
|
||||
WaitForIdleCompute();
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
void profileEvent(EngineType engine, bool type) const;
|
||||
|
||||
Reference in New Issue
Block a user