SWDEV-501757 - Use signals without interrupts

In active-wait mode, use signals without interrupts by default, and switch
to signals with interrupts only when a callback is required.

Change-Id: Ibcde8f7d44c70f8fb8fa5e0a7fdd8b08a2982a8e


[ROCm/clr commit: f4b9d3b7bd]
Этот коммит содержится в:
German Andryeyev
2024-12-04 19:14:38 -05:00
родитель 7346f3bd29
Коммит 6604accdb3
3 изменённых файлов: 94 добавлений и 39 удалений
+4 -3
Просмотреть файл
@@ -82,13 +82,14 @@ public:
Timestamp* ts_; //!< Timestamp object associated with the signal
HwQueueEngine engine_; //!< Engine used with this signal
amd::Monitor lock_; //!< Signal lock for update
bool isPacketDispatch_; //!< True if the packet associated with the signal is dispatch
typedef union {
struct {
uint32_t done_ : 1; //!< True if signal is done
uint32_t forceHostWait_ : 1; //!< Force Host Wait for dependency signals
uint32_t reserved_ : 30;
uint32_t isPacketDispatch_: 1; //!< True if the packet associated with the signal is dispatch
uint32_t interrupt_ : 1; //!< True if the signal will trigger an interrupt
uint32_t reserved_ : 28;
};
uint32_t data_;
} Flags;
@@ -99,9 +100,9 @@ public:
: ts_(nullptr)
, engine_(HwQueueEngine::Compute)
, lock_(true) /* Signal Ops Lock */
, isPacketDispatch_(false)
{
signal_.handle = 0;
flags_.data_ = 0;
flags_.done_ = true;
flags_.forceHostWait_ = true;
}
+83 -36
Просмотреть файл
@@ -158,7 +158,7 @@ void Timestamp::checkGpuTime() {
start = std::min(time.start, start);
end = std::max(time.end, end);
if ((command().type() == CL_COMMAND_TASK) && (it->isPacketDispatch_ == true)) {
if ((command().type() == CL_COMMAND_TASK) && (it->flags_.isPacketDispatch_ == true)) {
static_cast<amd::AccumulateCommand&>(command()).addTimestamps(time.start, time.end);
}
@@ -361,27 +361,64 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() {
CpuWaitForSignal(signal);
signal->release();
}
// Destroy all extra signals. Note: these signals must be idle already
while (signal_pool_.size() != 0) {
signal_pool_.top()->release();
signal_pool_.pop();
}
while (signal_pool_irq_.size() != 0) {
signal_pool_irq_.top()->release();
signal_pool_irq_.pop();
}
}
// ================================================================================================
bool VirtualGPU::HwQueueTracker::Create() {
uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
signal_list_.resize(kSignalListSize);
//! Creates the HSA signal for the given ProfilingSignal object.
//!
//! \param signal    Object whose signal_ handle receives the newly created HSA signal
//! \param interrupt Request a signal created with hsa_signal_create(). The request is forced
//!                  to true when AMD_DIRECT_DISPATCH is off or active wait is disabled
//! \return true if the HSA signal was created, false if the HSA call failed
bool VirtualGPU::HwQueueTracker::CreateSignal(ProfilingSignal* signal, bool interrupt) const {
hsa_agent_t agent = gpu_.gpu_device();
const Settings& settings = gpu_.dev().settings();
// With system scope signals no agent list is passed (nullptr / 0 agents),
// otherwise the list holds only the current GPU agent
hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
// MT path (AMD_DIRECT_DISPATCH disabled) will still have interrupts to avoid extra polling
// in the queue thread. Also runtime will still use interrupts if active wait was disabled
interrupt |= !AMD_DIRECT_DISPATCH || !gpu_.dev().ActiveWait();
// Check if the interrupt was requested for the signal
if (interrupt) {
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
return false;
}
// Record the signal type, so the callers can tell which kind of signal they hold
signal->flags_.interrupt_ = true;
} else {
// No interrupt requested: create the signal with the HSA_AMD_SIGNAL_AMD_GPU_ONLY attribute.
// flags_.interrupt_ is left untouched on this path.
// NOTE(review): presumably this attribute is what makes the signal interrupt-free -
// confirm against the ROCr documentation
if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(0, num_agents, agents,
HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal->signal_)) {
return false;
}
}
return true;
}
// ================================================================================================
bool VirtualGPU::HwQueueTracker::Create() {
const uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
signal_list_.resize(kSignalListSize);
for (uint i = 0; i < kSignalListSize; ++i) {
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
if ((signal == nullptr) ||
(HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) {
if ((signal == nullptr) || !CreateSignal(signal.get())) {
return false;
}
signal_list_[i] = signal.release();
}
// Add extra signals with the interrupts for the callbacks
if (AMD_DIRECT_DISPATCH && gpu_.dev().ActiveWait()) {
for (uint32_t i = 0; i < 5; ++i) {
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
constexpr bool kEnableInterrupt = true;
if ((signal == nullptr) || !CreateSignal(signal.get(), kEnableInterrupt)) {
return false;
}
signal_pool_irq_.push(signal.release());
}
}
return true;
}
@@ -395,19 +432,12 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
// If GPU is still busy with processing, then add more signals to avoid more frequent stalls
if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0) {
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
if (signal != nullptr) {
hsa_agent_t agent = gpu_.gpu_device();
const Settings& settings = gpu_.dev().settings();
hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
// Find valid new index
++current_id_ %= signal_list_.size();
// Insert the new signal into the current slot and ignore any wait
signal_list_.insert(signal_list_.begin() + current_id_, signal.release());
new_signal = true;
}
if ((signal != nullptr) && CreateSignal(signal.get())) {
// Find valid new index
++current_id_ %= signal_list_.size();
// Insert the new signal into the current slot and ignore any wait
signal_list_.insert(signal_list_.begin() + current_id_, signal.release());
new_signal = true;
}
}
@@ -429,22 +459,43 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
// The signal was assigned to the global marker's event, hence runtime can't reuse it
// and needs a new signal
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
if (signal != nullptr) {
hsa_agent_t agent = gpu_.gpu_device();
const Settings& settings = gpu_.dev().settings();
hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
if ((signal != nullptr) && CreateSignal(signal.get())) {
signal_list_[current_id_]->release();
signal_list_[current_id_] = signal.release();
} else {
assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!");
}
} else {
assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!");
}
}
bool enqueHandler = false;
if (AMD_DIRECT_DISPATCH) {
if (ts != nullptr) {
enqueHandler = (ts->command().Callback() != nullptr ||
ts->command().GetBatchHead() != nullptr ) &&
!ts->command().CpuWaitRequested();
}
// Check if the signal doesn't match the requested one.
// Note: runtime needs the interrupts for the callbacks in DD mode
if ((signal_list_[current_id_]->flags_.interrupt_ != enqueHandler) && gpu_.dev().ActiveWait()) {
// Use different stacks if interrupt is required
auto& stack_pop = (enqueHandler) ? signal_pool_irq_ : signal_pool_;
auto& stack_push = (enqueHandler) ? signal_pool_ : signal_pool_irq_;
// Check if a free signal in the pop stack isn't available
if (stack_pop.empty()) {
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
if ((signal != nullptr) && CreateSignal(signal.get(), enqueHandler)) {
stack_pop.push(signal.release());
}
}
// Make sure a free signal exists and replace it in the current slot
if (!stack_pop.empty()) {
stack_push.push(signal_list_[current_id_]);
signal_list_[current_id_] = stack_pop.top();
stack_pop.pop();
}
}
}
ProfilingSignal* prof_signal = signal_list_[current_id_];
// Reset the signal and return
hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val);
@@ -458,10 +509,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
prof_signal->ts_ = ts;
ts->AddProfilingSignal(prof_signal);
if (AMD_DIRECT_DISPATCH) {
bool enqueHandler = false;
enqueHandler = (ts->command().Callback() != nullptr ||
ts->command().GetBatchHead() != nullptr ) &&
!ts->command().CpuWaitRequested();
// If direct dispatch is enabled and the batch head isn't null, then it's a marker and
// requires the batch update upon HSA signal completion
if (enqueHandler) {
@@ -884,7 +931,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
}
ProfilingSignal* current_signal = Barriers().GetLastSignal();
current_signal->isPacketDispatch_ = true;
current_signal->flags_.isPacketDispatch_ = true;
}
}
+7
Просмотреть файл
@@ -31,6 +31,7 @@
#include "hsa/hsa_ven_amd_aqlprofile.h"
#include "rocsched.hpp"
#include "device/device.hpp"
#include <stack>
namespace amd::roc {
class Device;
@@ -297,7 +298,11 @@ class VirtualGPU : public device::VirtualDevice {
sdma_profiling_ = profile;
hsa_amd_profiling_async_copy_enable(profile);
}
private:
//! Creates HSA signal with the specified scope
bool CreateSignal(ProfilingSignal* signal, bool interrupt = false) const;
//! Wait for the next active signal
void WaitNext() {
size_t next = (current_id_ + 1) % signal_list_.size();
@@ -309,6 +314,8 @@ class VirtualGPU : public device::VirtualDevice {
bool CpuWaitForSignal(ProfilingSignal* signal);
HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
std::stack<ProfilingSignal*> signal_pool_irq_; //!< The pool of free signals with interrupts
std::stack<ProfilingSignal*> signal_pool_; //!< The pool of free signals without interrupt
std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing
size_t current_id_ = 0; //!< Last submitted signal
bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled