From 0ec2d8847d538e25c866b4e92d9d930d4abcc284 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Mon, 9 Oct 2023 23:35:11 +0000 Subject: [PATCH] SWDEV-301667 - Track last used SDMA engine per queue - Track last SDMA engine per queue; this results in better scheduling - Reset last SDMA engine upon batch completion. That ensures we don't get blocked if the same engine is used by another concurrent copy Change-Id: Id53111980da7ee41d5c932fb44e4aab5b1e065a3 [ROCm/clr commit: bf8baeecb3024fc05f28a5dbafc3a45ca47d395d] --- projects/clr/rocclr/device/rocm/rocblit.cpp | 24 ++++++++++++------- .../clr/rocclr/device/rocm/rocvirtual.cpp | 6 ++++- .../clr/rocclr/device/rocm/rocvirtual.hpp | 4 ++++ 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 4c4549f5a3..cf24735d06 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -705,12 +705,18 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) { if (copyMask == 0) { - // Check SDMA engine status - status = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Query copy engine status %x, free_engine mask 0x%x", - status, freeEngineMask); - // Return a mask with the rightmost bit set - copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1)); + // Check if there a recently used SDMA engine for the stream + copyMask = gpu().getLastUsedSdmaEngine(); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask); + if (copyMask == 0) { + // Check SDMA engine status + status = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Query copy engine status %x, free_engine mask 0x%x", + status, freeEngineMask); + // Return a mask with the
rightmost bit set + copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1)); + gpu().setLastUsedSdmaEngine(copyMask); + } } if (copyMask != 0 && status == HSA_STATUS_SUCCESS) { @@ -718,7 +724,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, hsa_amd_sdma_engine_id_t copyEngine = static_cast(copyMask); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Async Copy on copy_engine=%x, dst=0x%zx, src=0x%zx, " + "HSA Async Copy on copy_engine=0x%x, dst=0x%zx, src=0x%zx, " "size=%ld, wait_event=0x%zx, completion_signal=0x%zx", copyEngine, dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); @@ -2357,10 +2363,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, amd::ScopedLock k(lockXferOps_); bool result = false; constexpr size_t kFillImageThreshold = 256 * 256; - + // Use host fill if memory has direct access and image is small if (setup_.disableFillImage_ || - (gpuMem(memory).isHostMemDirectAccess() && + (gpuMem(memory).isHostMemDirectAccess() && (size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 95e07f3b79..697d2c0df6 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -214,6 +214,9 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) { // Update the batch, since signal is complete ts->gpu()->updateCommandsState(ts->command().GetBatchHead()); + // Reset last used SDMA engine mask + ts->gpu()->setLastUsedSdmaEngine(0); + // Reset API callback signal. 
It will release AQL queue and start commands processing if (callback_signal.handle != 0) { hsa_signal_subtract_relaxed(callback_signal, 1); @@ -1184,7 +1187,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, priority_(priority), copy_command_type_(0), fence_state_(Device::CacheState::kCacheStateInvalid), - fence_dirty_(false) + fence_dirty_(false), + lastUsedSdmaEngineMask_(0) { index_ = device.numOfVgpus_++; gpu_device_ = device.getBackendDevice(); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index cc90df0e06..bb6d7f0c7e 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -419,6 +419,8 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } void resetFenceDirty() { fence_dirty_ = false; } + void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; } + uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); } // } roc OpenCL integration private: //! Dispatches a barrier with blocking HSA signals @@ -567,5 +569,7 @@ class VirtualGPU : public device::VirtualDevice { int fence_state_; //!< Fence scope //!< kUnknown/kFlushedToDevice/kFlushedToSystem bool fence_dirty_; //!< Fence modified flag + + std::atomic lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask }; }