From 6604accdb3a26b7d9935e212606909796f780a48 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Wed, 4 Dec 2024 19:14:38 -0500 Subject: [PATCH] SWDEV-501757 - Use signals without interrupts In active wait mode use signals without interrupts by default and switch to the interrupts only if a callback is required. Change-Id: Ibcde8f7d44c70f8fb8fa5e0a7fdd8b08a2982a8e [ROCm/clr commit: f4b9d3b7bd3267de9d77b15b93223dff9472be73] --- projects/clr/rocclr/device/rocm/rocdevice.hpp | 7 +- .../clr/rocclr/device/rocm/rocvirtual.cpp | 119 ++++++++++++------ .../clr/rocclr/device/rocm/rocvirtual.hpp | 7 ++ 3 files changed, 94 insertions(+), 39 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 1535ee01de..052cda1c97 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -82,13 +82,14 @@ public: Timestamp* ts_; //!< Timestamp object associated with the signal HwQueueEngine engine_; //!< Engine used with this signal amd::Monitor lock_; //!< Signal lock for update - bool isPacketDispatch_; //!< True if the packet associated with the signal is dispatch typedef union { struct { uint32_t done_ : 1; //!< True if signal is done uint32_t forceHostWait_ : 1; //!< Force Host Wait for dependency signals - uint32_t reserved_ : 30; + uint32_t isPacketDispatch_: 1; //!< True if the packet associated with the signal is dispatch + uint32_t interrupt_ : 1; //!< True if the signal will trigger an interrupt + uint32_t reserved_ : 28; }; uint32_t data_; } Flags; @@ -99,9 +100,9 @@ public: : ts_(nullptr) , engine_(HwQueueEngine::Compute) , lock_(true) /* Signal Ops Lock */ - , isPacketDispatch_(false) { signal_.handle = 0; + flags_.data_ = 0; flags_.done_ = true; flags_.forceHostWait_ = true; } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 7d52ce0cf9..7ebb883efa 100644 --- 
a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -158,7 +158,7 @@ void Timestamp::checkGpuTime() { start = std::min(time.start, start); end = std::max(time.end, end); - if ((command().type() == CL_COMMAND_TASK) && (it->isPacketDispatch_ == true)) { + if ((command().type() == CL_COMMAND_TASK) && (it->flags_.isPacketDispatch_ == true)) { static_cast(command()).addTimestamps(time.start, time.end); } @@ -361,27 +361,64 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() { CpuWaitForSignal(signal); signal->release(); } + // Destroy all extra signals. Note: these signals must be idle already + while (signal_pool_.size() != 0) { + signal_pool_.top()->release(); + signal_pool_.pop(); + } + while (signal_pool_irq_.size() != 0) { + signal_pool_irq_.top()->release(); + signal_pool_irq_.pop(); + } } // ================================================================================================ -bool VirtualGPU::HwQueueTracker::Create() { - uint kSignalListSize = ROC_SIGNAL_POOL_SIZE; - - signal_list_.resize(kSignalListSize); - +bool VirtualGPU::HwQueueTracker::CreateSignal(ProfilingSignal* signal, bool interrupt) const { hsa_agent_t agent = gpu_.gpu_device(); const Settings& settings = gpu_.dev().settings(); hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent; uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1; + // MT path will still have interrupts to avoid extra polling in the queue thread. 
+ // Also runtime will still use interrupts if active wait was disabled + interrupt |= !AMD_DIRECT_DISPATCH || !gpu_.dev().ActiveWait(); + // Check if the interrupt was requested for the signal + if (interrupt) { + if (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_)) { + return false; + } + signal->flags_.interrupt_ = true; + } else { + if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(0, num_agents, agents, + HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal->signal_)) { + return false; + } + } + return true; +} + +// ================================================================================================ +bool VirtualGPU::HwQueueTracker::Create() { + const uint kSignalListSize = ROC_SIGNAL_POOL_SIZE; + signal_list_.resize(kSignalListSize); for (uint i = 0; i < kSignalListSize; ++i) { std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal()); - if ((signal == nullptr) || - (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) { + if ((signal == nullptr) || !CreateSignal(signal.get())) { return false; } signal_list_[i] = signal.release(); } + // Add extra signals with the interrupts for the callbacks + if (AMD_DIRECT_DISPATCH && gpu_.dev().ActiveWait()) { + for (uint32_t i = 0; i < 5; ++i) { + std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal()); + constexpr bool kEnableInterrupt = true; + if ((signal == nullptr) || !CreateSignal(signal.get(), kEnableInterrupt)) { + return false; + } + signal_pool_irq_.push(signal.release()); + } + } return true; } @@ -395,19 +432,12 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( // If GPU is still busy with processing, then add more signals to avoid more frequent stalls if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0) { std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal()); - if (signal != nullptr) { - hsa_agent_t agent = gpu_.gpu_device(); - const Settings& settings = gpu_.dev().settings(); - hsa_agent_t* agents = (settings.system_scope_signal_) ? 
nullptr : &agent; - uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1; - - if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) { - // Find valid new index - ++current_id_ %= signal_list_.size(); - // Insert the new signal into the current slot and ignore any wait - signal_list_.insert(signal_list_.begin() + current_id_, signal.release()); - new_signal = true; - } + if ((signal != nullptr) && CreateSignal(signal.get())) { + // Find valid new index + ++current_id_ %= signal_list_.size(); + // Insert the new signal into the current slot and ignore any wait + signal_list_.insert(signal_list_.begin() + current_id_, signal.release()); + new_signal = true; } } @@ -429,22 +459,43 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( // The signal was assigned to the global marker's event, hence runtime can't reuse it // and needs a new signal std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal()); - if (signal != nullptr) { - hsa_agent_t agent = gpu_.gpu_device(); - const Settings& settings = gpu_.dev().settings(); - hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent; - uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1; - - if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) { + if ((signal != nullptr) && CreateSignal(signal.get())) { signal_list_[current_id_]->release(); signal_list_[current_id_] = signal.release(); - } else { - assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!"); - } } else { assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!"); } } + + bool enqueHandler = false; + if (AMD_DIRECT_DISPATCH) { + if (ts != nullptr) { + enqueHandler = (ts->command().Callback() != nullptr || + ts->command().GetBatchHead() != nullptr ) && + !ts->command().CpuWaitRequested(); + } + // Check if the signal doesn't match the requested one. 
+ // Note: runtime needs the interrupts for the callbacks in DD mode + if ((signal_list_[current_id_]->flags_.interrupt_ != enqueHandler) && gpu_.dev().ActiveWait()) { + // Use different stacks if interrupt is required + auto& stack_pop = (enqueHandler) ? signal_pool_irq_ : signal_pool_; + auto& stack_push = (enqueHandler) ? signal_pool_ : signal_pool_irq_; + + // Check if a free signal in the pop stack isn't available + if (stack_pop.empty()) { + std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal()); + if ((signal != nullptr) && CreateSignal(signal.get(), enqueHandler)) { + stack_pop.push(signal.release()); + } + } + // Make sure a free signal exists and replace it in the current slot + if (!stack_pop.empty()) { + stack_push.push(signal_list_[current_id_]); + signal_list_[current_id_] = stack_pop.top(); + stack_pop.pop(); + } + } + } ProfilingSignal* prof_signal = signal_list_[current_id_]; // Reset the signal and return hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val); @@ -458,10 +509,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( prof_signal->ts_ = ts; ts->AddProfilingSignal(prof_signal); if (AMD_DIRECT_DISPATCH) { - bool enqueHandler = false; - enqueHandler = (ts->command().Callback() != nullptr || - ts->command().GetBatchHead() != nullptr ) && - !ts->command().CpuWaitRequested(); // If direct dispatch is enabled and the batch head isn't null, then it's a marker and // requires the batch update upon HSA signal completion if (enqueHandler) { @@ -884,7 +931,7 @@ bool VirtualGPU::dispatchGenericAqlPacket( } ProfilingSignal* current_signal = Barriers().GetLastSignal(); - current_signal->isPacketDispatch_ = true; + current_signal->flags_.isPacketDispatch_ = true; } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index ce252991f9..a6d41d69a8 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -31,6 +31,7 @@ #include 
"hsa/hsa_ven_amd_aqlprofile.h" #include "rocsched.hpp" #include "device/device.hpp" +#include <stack> namespace amd::roc { class Device; @@ -297,7 +298,11 @@ class VirtualGPU : public device::VirtualDevice { sdma_profiling_ = profile; hsa_amd_profiling_async_copy_enable(profile); } + private: + //! Creates HSA signal with the specified scope + bool CreateSignal(ProfilingSignal* signal, bool interrupt = false) const; + //! Wait for the next active signal void WaitNext() { size_t next = (current_id_ + 1) % signal_list_.size(); @@ -309,6 +314,8 @@ class VirtualGPU : public device::VirtualDevice { bool CpuWaitForSignal(ProfilingSignal* signal); HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations + std::stack<ProfilingSignal*> signal_pool_irq_; //!< The pool of free signals with interrupts + std::stack<ProfilingSignal*> signal_pool_; //!< The pool of free signals without interrupt std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing size_t current_id_ = 0; //!< Last submitted signal bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled