From cc229f251f514c70f4c1e0a569fb484babdd4dce Mon Sep 17 00:00:00 2001 From: Jimbo Xie Date: Tue, 7 Jan 2025 18:30:49 -0500 Subject: [PATCH] SWDEV-504383 - Cleaned up kForcedTimeout10us and removed IsHwEventReadyForcedWait Also removed active_wait_timeout Change-Id: I7a429f003c09a4df267b5c0983050704260094c6 [ROCm/clr commit: 4872b420c95e32a8f362672769597dc809dbfed6] --- projects/clr/rocclr/device/device.hpp | 6 ---- projects/clr/rocclr/device/rocm/rocblit.cpp | 30 ++----------------- projects/clr/rocclr/device/rocm/rocblit.hpp | 2 -- projects/clr/rocclr/device/rocm/rocdevice.cpp | 12 -------- projects/clr/rocclr/device/rocm/rocdevice.hpp | 6 +--- .../clr/rocclr/device/rocm/rocvirtual.cpp | 23 +++++--------- .../clr/rocclr/device/rocm/rocvirtual.hpp | 22 ++------------ 7 files changed, 14 insertions(+), 87 deletions(-) diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 92b38dcc1f..d12f896d9b 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1981,12 +1981,6 @@ class Device : public RuntimeObject { return false; }; - // Returns the status of HW event, associated with amd::Event - virtual bool IsHwEventReadyForcedWait( - const amd::Event& event) const { //!< AMD event for HW status validation - return false; - }; - virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; virtual const uint32_t getPreferredNumaNode() const { return 0; } diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 108729cebe..d335dcb155 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -518,14 +518,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, } gpu().Barriers().SetActiveEngine(engine); - // Check if host wait has to be forced - bool forceHostWait = forceHostWaitFunc(size); - - constexpr bool kIgnoreHostWait = false; - // Ignore waiting on any previous kernel dispatch and queue a signal to ROCr copy api instead - auto wait_events = gpu().Barriers().WaitingSignal(engine, kIgnoreHostWait); - hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp(), - forceHostWait); + auto wait_events = gpu().Barriers().WaitingSignal(engine); + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) { if (copyMask == 0) { @@ -2635,26 +2629,6 @@ amd::Memory* DmaBlitManager::pinHostMemory(const void* hostMem, size_t pinSize, return amdMemory; } -bool DmaBlitManager::forceHostWaitFunc(size_t copy_size) const { - // 10us wait is true for all other targets. - bool forceHostWait = true; - // Based on the profiled results, do not wait for copy size > 24 KB. - static constexpr size_t kGfx90aCopyThreshold = 24; - - if ((dev().isa().versionMajor() == 9 && dev().isa().versionMinor() == 0 - && dev().isa().versionStepping() == 10) && (copy_size >= kGfx90aCopyThreshold * Ki)) { - // Check if this is gfx90a and restrict small copy to 24K. - forceHostWait = false; - } else if ((dev().isa().versionMajor() == 9) && (dev().isa().versionMinor() == 4) - && (dev().isa().versionStepping() == 0 || dev().isa().versionStepping() == 1 - || dev().isa().versionStepping() == 2)) { - // for gfx940, gfx941, gfx942, dependency signal resolution is fast, so no Host wait at all. - forceHostWait = false; - } - - return forceHostWait; -} - Memory* KernelBlitManager::createView(const Memory& parent, cl_image_format format, cl_mem_flags flags) const { assert((parent.owner()->asBuffer() == nullptr) && "View supports images only"); diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp index 1d9d0e670d..7e92feb574 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.hpp +++ b/projects/clr/rocclr/device/rocm/rocblit.hpp @@ -260,8 +260,6 @@ class DmaBlitManager : public device::HostBlitManager { bool hostToDev, //!< True if data is copied from H2D amd::CopyMetadata& copyMetadata //!< Memory copy MetaData ) const; - - bool forceHostWaitFunc(size_t copy_size) const; }; //! Kernel Blit Manager diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 71fae55468..e2f4eb1740 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -2914,18 +2914,6 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI return result; } -// ================================================================================================ -bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const { - void* hw_event = - (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); - if (hw_event == nullptr) { - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); - return false; - } - static constexpr bool Timeout = true; - return WaitForSignal(reinterpret_cast(hw_event)->signal_, false, true); -} - // ================================================================================================ bool Device::IsHwEventReady(const amd::Event& event, bool wait, uint32_t hip_event_flags) const { void* hw_event = diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index db602bbb66..719af1ea9b 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -86,10 +86,9 @@ public: typedef union { struct { uint32_t done_ : 1; //!< True if signal is done - uint32_t forceHostWait_ : 1; //!< Force Host Wait for dependency signals uint32_t isPacketDispatch_: 1; //!< True if the packet, used with the signal, is dispatch uint32_t interrupt_ : 1; //!< True if the signal will trigger an interrupt - uint32_t reserved_ : 28; + uint32_t reserved_ : 29; }; uint32_t data_; } Flags; @@ -104,7 +103,6 @@ public: signal_.handle = 0; flags_.data_ = 0; flags_.done_ = true; - flags_.forceHostWait_ = true; } virtual ~ProfilingSignal(); @@ -298,7 +296,6 @@ class NullDevice : public amd::Device { return false; } - bool IsHwEventReadyForcedWait(const amd::Event& event) const override { return false; } void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const override{}; void ReleaseGlobalSignal(void* signal) const override {} @@ -498,7 +495,6 @@ class Device : public NullDevice { virtual bool IsHwEventReady(const amd::Event& event, bool wait = false, uint32_t hip_event_flags = 0) const; - virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; virtual void ReleaseGlobalSignal(void* signal) const; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 938c4fcc5d..cd517ca864 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -422,7 +422,7 @@ bool VirtualGPU::HwQueueTracker::Create() { // ================================================================================================ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( - hsa_signal_value_t init_val, Timestamp* ts, bool forceHostWait) { + hsa_signal_value_t init_val, Timestamp* ts) { bool new_signal = false; // Peep signal +2 ahead to see if its done @@ -502,7 +502,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( prof_signal->flags_.done_ = false; prof_signal->engine_ = engine_; prof_signal->flags_.isPacketDispatch_ = false; - prof_signal->flags_.forceHostWait_ = forceHostWait; if (ts != 0) { // Save HSA signal earlier to make sure the possible callback will have a valid // value for processing @@ -548,8 +547,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( } // ================================================================================================ -std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine, - bool forceHostWait) { +std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) { bool explicit_wait = false; // Reset all current waiting signals waiting_signals_.clear(); @@ -595,17 +593,12 @@ std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi // Early signal status check if (hsa_signal_load_relaxed(external_signals_[i]->signal_) > 0) { const Settings& settings = gpu_.dev().settings(); - // Actively wait on CPU to avoid extra overheads of signal tracking on GPU. - // For small copies set forced wait - if (!WaitForSignal(external_signals_[i]->signal_, false, forceHostWait ? - external_signals_[i]->flags_.forceHostWait_ : false)) { - if (settings.cpu_wait_for_signal_) { - // Wait on CPU for completion if requested - CpuWaitForSignal(external_signals_[i]); - } else { - // Add HSA signal for tracking on GPU - waiting_signals_.push_back(external_signals_[i]->signal_); - } + if (settings.cpu_wait_for_signal_) { + // Wait on CPU for completion if requested + CpuWaitForSignal(external_signals_[i]); + } else { + // Add HSA signal for tracking on GPU + waiting_signals_.push_back(external_signals_[i]->signal_); } } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 15e89aaba7..76f9b8ecb7 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -46,24 +46,12 @@ constexpr static hsa_signal_value_t kInitSignalValueOne = 1; constexpr static uint64_t kTimeout100us = 100 * K; constexpr static uint64_t kUnlimitedWait = std::numeric_limits::max(); -// Active wait time out incase same sdma engine is used again, -// then just wait instead of adding dependency wait signal. -constexpr static uint64_t kForcedTimeout10us = 10; - -template -inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) { +inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false) { if (hsa_signal_load_relaxed(signal) > 0) { uint64_t timeout = kTimeout100us; if (active_wait) { timeout = kUnlimitedWait; } - if (active_wait_timeout) { - // If forced wait is set, then wait for 10us, else dont wait. (ns * K = us) - timeout = (forced_wait ? kForcedTimeout10us : ROC_ACTIVE_WAIT_TIMEOUT) * K; - if (timeout == 0) { - return false; - } - } ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host active wait for Signal = (0x%lx) for %d ns", signal.handle, timeout); @@ -71,9 +59,6 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool fo // Active wait with a timeout if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, timeout, HSA_WAIT_STATE_ACTIVE) != 0) { - if (active_wait_timeout) { - return false; - } ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host blocked wait for Signal = (0x%lx)", signal.handle); @@ -269,7 +254,7 @@ class VirtualGPU : public device::VirtualDevice { //! Finds a free signal for the upcomming operation hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne, - Timestamp* ts = nullptr, bool forceHostWait = true); + Timestamp* ts = nullptr); //! Wait for the curent active signal. Can idle the queue bool WaitCurrent() { @@ -282,8 +267,7 @@ class VirtualGPU : public device::VirtualDevice { HwQueueEngine GetActiveEngine() const { return engine_; } //! Returns the last submitted signal for a wait - std::vector& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute, - bool forceHostWait = true); + std::vector& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute); //! Resets current signal back to the previous one. It's necessary in a case of ROCr failure. void ResetCurrentSignal();