From a3a775c94472379fbc06e9582fe0dcdf888ce06d Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Fri, 16 Aug 2024 15:23:19 +0000 Subject: [PATCH] Separate AsyncEventsLoop into two separate threads This fixes an issue for missing HW events when out of HW events. We cannot determine whether a HW event has occurred unless we call the underlying drivers with hsaKmtWaitOnMultipleEvents_Ext. Previous logic in Signal::WaitAny would switch to ACTIVE_WAIT state if we run out of hardware events (signal->EopEvent() == NULL) and this would cause the hsaKmtWaitOnMultipleEvents_Ext call to be skipped. But also, when we have some signals without hardware events, calling hsaKmtWaitOnMultipleEvents_Ext with a timeout of 0 so that we can poll for remaining signals adds overhead with an IOCTL call and may cause extra delay. Separating AsyncEventLoop into two separate threads so that: 1. We can have a new Signal::WaitAnyExceptions to wait for HW events This function can be simpler as it does not have to perform all the timer calculations because it is expected to be always waiting on hsaKmtWaitOnMultipleEvents_Ext through the lifetime of a process. 2. Signal::WaitAny does not need to have extra code to check for HW exceptions as it only needs to handle HSA_EVENTTYPE_SIGNAL events. It can also skip the calls to hsaKmtWaitOnMultipleEvents_Ext if needed. Change-Id: I52ba99fd6e483e0cb477b7931a0dcc03520aa523 Signed-off-by: David Yat Sin [ROCm/ROCR-Runtime commit: 88eaa834d027f420ec88550182b979685fefb094] --- .../runtime/hsa-runtime/core/inc/runtime.h | 12 +- .../runtime/hsa-runtime/core/inc/signal.h | 14 ++ .../hsa-runtime/core/runtime/runtime.cpp | 77 +++++++---- .../hsa-runtime/core/runtime/signal.cpp | 124 +++++++++++++++--- 4 files changed, 183 insertions(+), 44 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h index 6bbff2121c..b9af49cac3 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h @@ -669,11 +669,15 @@ class Runtime { // Deprecated HSA Region API GPU (for legacy APU support only) Agent* region_gpu_; - AsyncEventsControl async_events_control_; + struct AsyncEventsInfo { + AsyncEventsControl control; + AsyncEvents events; + AsyncEvents new_events; + bool monitor_exceptions; + }; - AsyncEvents async_events_; - - AsyncEvents new_async_events_; + struct AsyncEventsInfo asyncSignals_; + struct AsyncEventsInfo asyncExceptions_; // System clock frequency. uint64_t sys_clock_freq_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h index 39e5321867..f960bc34b1 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h @@ -356,6 +356,20 @@ class Signal { uint64_t timeout_hint, hsa_wait_state_t wait_hint, hsa_signal_value_t* satisfying_value); + /// @brief Dedicated funtion to wait on signals that are not of type HSA_EVENTTYPE_SIGNAL + /// these events can only be received by calling the underlying driver (i.e via the hsaKmtWaitOnMultipleEvents_Ext + /// function call). We still need to have 1 signal of type HSA_EVENT_TYPE_SIGNAL attached to the list of signals + /// to be able to force hsaKmtWaitOnMultipleEvents_Ext to return. + /// @param signal_count Number of hsa_signals + /// @param hsa_signals Pointer to array of signals. All signals should have a valid EopEvent() + /// @param conds list of conditions + /// @param values list of values + /// @param satisfying_value value to be satisfied + /// @return index of signal that satisfies condition + static uint32_t WaitAnyExceptions(uint32_t signal_count, const hsa_signal_t* hsa_signals, + const hsa_signal_condition_t* conds, const hsa_signal_value_t* values, + hsa_signal_value_t* satisfying_value); + __forceinline bool IsType(rtti_t id) { return _IsA(id); } /// @brief Prevents the signal from being destroyed until the matching Release(). diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index ed136e4038..92a5eb7912 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -802,35 +802,44 @@ hsa_status_t Runtime::SetAsyncSignalHandler(hsa_signal_t signal, hsa_signal_value_t value, hsa_amd_signal_handler handler, void* arg) { - // Indicate that this signal is in use. - if (signal.handle != 0) hsa_signal_handle(signal)->Retain(); - ScopedAcquire scope_lock(&async_events_control_.lock); + struct AsyncEventsInfo* asyncInfo = &asyncSignals_; + + if (signal.handle != 0) { + // Indicate that this signal is in use. + hsa_signal_handle(signal)->Retain(); + + core::Signal* coreSignal = core::Signal::Convert(signal); + if (coreSignal->EopEvent() && coreSignal->EopEvent()->EventData.EventType != HSA_EVENTTYPE_SIGNAL) + asyncInfo = &asyncExceptions_; + } + + ScopedAcquire scope_lock(&asyncInfo->control.lock); // Lazy initializer - if (async_events_control_.async_events_thread_ == NULL) { + if (asyncInfo->control.async_events_thread_ == NULL) { // Create monitoring thread control signal - auto err = HSA::hsa_signal_create(0, 0, NULL, &async_events_control_.wake); + auto err = HSA::hsa_signal_create(0, 0, NULL, &asyncInfo->control.wake); if (err != HSA_STATUS_SUCCESS) { assert(false && "Asyncronous events control signal creation error."); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - async_events_.PushBack(async_events_control_.wake, HSA_SIGNAL_CONDITION_NE, - 0, NULL, NULL); + asyncInfo->events.PushBack(asyncInfo->control.wake, HSA_SIGNAL_CONDITION_NE, + 0, NULL, NULL); // Start event monitoring thread - async_events_control_.exit = false; - async_events_control_.async_events_thread_ = - os::CreateThread(AsyncEventsLoop, NULL); - if (async_events_control_.async_events_thread_ == NULL) { + asyncInfo->control.exit = false; + asyncInfo->control.async_events_thread_ = + os::CreateThread(AsyncEventsLoop, asyncInfo); + if (asyncInfo->control.async_events_thread_ == NULL) { assert(false && "Asyncronous events thread creation error."); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } } - new_async_events_.PushBack(signal, cond, value, handler, arg); + asyncInfo->new_events.PushBack(signal, cond, value, handler, arg); - hsa_signal_handle(async_events_control_.wake)->StoreRelease(1); + hsa_signal_handle(asyncInfo->control.wake)->StoreRelease(1); return HSA_STATUS_SUCCESS; } @@ -1546,18 +1555,35 @@ hsa_status_t Runtime::IPCDetach(void* ptr) { return HSA_STATUS_SUCCESS; } -void Runtime::AsyncEventsLoop(void*) { - auto& async_events_control_ = runtime_singleton_->async_events_control_; - auto& async_events_ = runtime_singleton_->async_events_; - auto& new_async_events_ = runtime_singleton_->new_async_events_; +void Runtime::AsyncEventsLoop(void* _eventsInfo) { + struct AsyncEventsInfo* eventsInfo = reinterpret_cast(_eventsInfo); + + auto& async_events_control_ = eventsInfo->control; + auto& async_events_ = eventsInfo->events; + auto& new_async_events_ = eventsInfo->new_events; while (!async_events_control_.exit) { // Wait for a signal hsa_signal_value_t value; - uint32_t index = AMD::hsa_amd_signal_wait_any( - uint32_t(async_events_.Size()), &async_events_.signal_[0], - &async_events_.cond_[0], &async_events_.value_[0], uint64_t(-1), - HSA_WAIT_STATE_BLOCKED, &value); + uint32_t index = 0; + + if (eventsInfo->monitor_exceptions) { + index = Signal::WaitAnyExceptions( + uint32_t(async_events_.Size()), + &async_events_.signal_[0], + &async_events_.cond_[0], + &async_events_.value_[0], + &value); + } else { + index = AMD::hsa_amd_signal_wait_any( + uint32_t(async_events_.Size()), + &async_events_.signal_[0], + &async_events_.cond_[0], + &async_events_.value_[0], + uint64_t(-1), + HSA_WAIT_STATE_BLOCKED, + &value); + } // Reset the control signal if (index == 0) { @@ -1922,7 +1948,11 @@ Runtime::Runtime() hw_exception_event_(nullptr), hw_exception_signal_(nullptr), ref_count_(0), - kfd_version{} {} + kfd_version{} { + + asyncSignals_.monitor_exceptions = false; + asyncExceptions_.monitor_exceptions = true; +} hsa_status_t Runtime::Load() { os::cpuid_t cpuinfo; @@ -2000,7 +2030,8 @@ void Runtime::Unload() { std::for_each(disabled_gpu_agents_.begin(), disabled_gpu_agents_.end(), DeleteObject()); disabled_gpu_agents_.clear(); - async_events_control_.Shutdown(); + asyncSignals_.control.Shutdown(); + asyncExceptions_.control.Shutdown(); if (vm_fault_signal_ != nullptr) { vm_fault_signal_->DestroySignal(); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp index 065f8ccc9f..e8fbe76993 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp @@ -255,23 +255,8 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals, while (true) { // Cannot mwaitx - polling multiple signals for (uint32_t i = 0; i < signal_count; i++) { - if (!signals[i]->IsValid()) return uint32_t(-1); - - // Handling special event. - if (signals[i]->EopEvent() != NULL) { - const HSA_EVENTTYPE event_type = - signals[i]->EopEvent()->EventData.EventType; - if (event_type == HSA_EVENTTYPE_MEMORY) { - const HsaMemoryAccessFault& fault = - signals[i]->EopEvent()->EventData.EventData.MemoryAccessFault; - if (fault.Flags == HSA_EVENTID_MEMORY_FATAL_PROCESS) { - return i; - } - } else if (event_type == HSA_EVENTTYPE_HW_EXCEPTION) { - const HsaHwException& exception = signals[i]->EopEvent()->EventData.EventData.HwException; - if (exception.MemoryLost) return i; - } - } + if (!signals[i]->IsValid()) + return uint32_t(-1); value = atomic::Load(&signals[i]->signal_.value, std::memory_order_relaxed); @@ -325,6 +310,111 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals, } } +/* + * Special handler to wait listen for exceptions from underlying driver. + */ +uint32_t Signal::WaitAnyExceptions(uint32_t signal_count, const hsa_signal_t* hsa_signals, + const hsa_signal_condition_t* conds, const hsa_signal_value_t* values, + hsa_signal_value_t* satisfying_value) { + + uint32_t wait_ms = uint32_t(-1); + hsa_signal_handle* signals = + reinterpret_cast(const_cast(hsa_signals)); + + for (uint32_t i = 0; i < signal_count; i++) signals[i]->Retain(); + + MAKE_SCOPE_GUARD([&]() { + for (uint32_t i = 0; i < signal_count; i++) signals[i]->Release(); + }); + + uint32_t prior = 0; + for (uint32_t i = 0; i < signal_count; i++) prior = Max(prior, signals[i]->waiting_++); + + + MAKE_SCOPE_GUARD([&]() { + for (uint32_t i = 0; i < signal_count; i++) signals[i]->waiting_--; + }); + + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_event_age) + // Allow only the first waiter to sleep. Without event age tracking, + // race condition can cause some threads to sleep without wakeup since missing interrupt. + if (prior != 0) wait_ms = 0; + + HsaEvent** evts = new HsaEvent* [signal_count]; + MAKE_SCOPE_GUARD([&]() { delete[] evts; }); + + uint32_t unique_evts = 0; + + for (uint32_t i = 0; i < signal_count; i++) { + assert(signals[i]->EopEvent() != NULL); + evts[i] = signals[i]->EopEvent(); + } + + std::sort(evts, evts + signal_count); + HsaEvent** end = std::unique(evts, evts + signal_count); + unique_evts = uint32_t(end - evts); + + uint64_t event_age[unique_evts]; + memset(event_age, 0, unique_evts * sizeof(uint64_t)); + if (core::Runtime::runtime_singleton_->KfdVersion().supports_event_age) + for (uint32_t i = 0; i < unique_evts; i++) + event_age[i] = 1; + + int64_t value; + + bool condition_met = false; + while (true) { + // Cannot mwaitx - polling multiple signals + + for (uint32_t i = 0; i < signal_count; i++) { + if (!signals[i]->IsValid()) + return uint32_t(-1); + + const HSA_EVENTTYPE event_type = signals[i]->EopEvent()->EventData.EventType; + if (event_type == HSA_EVENTTYPE_MEMORY) { + const HsaMemoryAccessFault& fault = + signals[i]->EopEvent()->EventData.EventData.MemoryAccessFault; + if (fault.Flags == HSA_EVENTID_MEMORY_FATAL_PROCESS) return i; + } else if (event_type == HSA_EVENTTYPE_HW_EXCEPTION) { + const HsaHwException& exception = + signals[i]->EopEvent()->EventData.EventData.HwException; + if (exception.MemoryLost) return i; + } + + value = atomic::Load(&signals[i]->signal_.value, std::memory_order_relaxed); + + switch (conds[i]) { + case HSA_SIGNAL_CONDITION_EQ: { + condition_met = (value == values[i]); + break; + } + case HSA_SIGNAL_CONDITION_NE: { + condition_met = (value != values[i]); + break; + } + case HSA_SIGNAL_CONDITION_GTE: { + condition_met = (value >= values[i]); + break; + } + case HSA_SIGNAL_CONDITION_LT: { + condition_met = (value < values[i]); + break; + } + default: { + return uint32_t(-1); + } + } + if (condition_met) { + if (satisfying_value != NULL) *satisfying_value = value; + // Some other signal in the list satisfied condition + return i; + } + } + + hsaKmtWaitOnMultipleEvents_Ext(evts, unique_evts, false, wait_ms, event_age); + } //while +} + SignalGroup::SignalGroup(uint32_t num_signals, const hsa_signal_t* hsa_signals) : count(num_signals) { if (count != 0) {