From 072fb0804e2a30e4645c2780fa8fe8a2eafad315 Mon Sep 17 00:00:00 2001 From: "Kudchadker, Saleel" Date: Fri, 25 Apr 2025 08:46:44 -0700 Subject: [PATCH] SWDEV-521647 - Fix tracking of hw_event (#206) - When a command may have two packets (like the device heap initializer) and there is no signal on the main kernel packet, the tracking was broken because the HW event of the command was marked as the first packet's signal. - Make sure that if no completion signal is attached to the second packet, the HW event for the command is cleared. --- rocclr/device/rocm/rocdevice.cpp | 6 +++- rocclr/device/rocm/rocvirtual.cpp | 47 ++++++++++++++++++++----------- rocclr/device/rocm/rocvirtual.hpp | 2 +- rocclr/platform/command.cpp | 6 ++-- rocclr/platform/command.hpp | 2 +- 5 files changed, 39 insertions(+), 24 deletions(-) diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 66c652c7fc..282cb56c4c 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -2863,7 +2863,11 @@ bool Device::IsHwEventReady(const amd::Event& event, bool wait, uint32_t hip_eve bool active_wait = !(hip_event_flags & kHipEventBlockingSync) && ActiveWait(); return WaitForSignal(reinterpret_cast(hw_event)->signal_, active_wait); } - return (hsa_signal_load_relaxed(reinterpret_cast(hw_event)->signal_) == 0); + + auto signal = reinterpret_cast(hw_event)->signal_; + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Check HW event = 0x%lx", signal.handle); + + return (hsa_signal_load_relaxed(signal) == 0); } // ================================================================================================ diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 18d696a589..23b3102ca7 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -422,7 +422,20 @@ bool VirtualGPU::HwQueueTracker::Create() { // ================================================================================================ 
hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( - hsa_signal_value_t init_val, Timestamp* ts) { + hsa_signal_value_t init_val, Timestamp* ts, bool attach_signal) { + + amd::Command* cmd = gpu_.command(); + // If no signal is needed, decrement the refcount and clear the hw_event of current command + if (!attach_signal) { + if (nullptr != cmd) { + if (cmd->HwEvent() != nullptr) { + reinterpret_cast(cmd->HwEvent())->release(); + } + cmd->SetHwEvent(nullptr); + } + return hsa_signal_t {0}; + } + bool new_signal = false; // Peep signal +2 ahead to see if its done @@ -503,8 +516,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( prof_signal->engine_ = engine_; prof_signal->flags_.isPacketDispatch_ = false; - // Store the HW event - amd::Command* cmd = gpu_.command(); + if (nullptr != cmd) { // Release any existing HwEvent before setting new one for the same command if (cmd->HwEvent() != nullptr) { @@ -1026,24 +1038,25 @@ bool VirtualGPU::dispatchGenericAqlPacket( fence_state_ = static_cast(expected_fence_state); - if (timestamp_ != nullptr || attach_signal) { - // Get active signal for current dispatch if profiling is necessary - packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); - - if (std::is_same::value) { - // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can - // retrieve this correlation ID to attribute waves to specific dispatch locations. 
- if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) { - auto dispatchPacket = reinterpret_cast(packet); - dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_; - } - - ProfilingSignal* current_signal = Barriers().GetLastSignal(); - current_signal->flags_.isPacketDispatch_ = true; + bool attachSignal = timestamp_ != nullptr || attach_signal; + // Get active signal for current dispatch if profiling is necessary + packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, + timestamp_, attachSignal); + if (std::is_same::value + && timestamp_ != nullptr) { + // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can + // retrieve this correlation ID to attribute waves to specific dispatch locations. + if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH) ) { + auto dispatchPacket = reinterpret_cast(packet); + dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_; } + + ProfilingSignal* current_signal = Barriers().GetLastSignal(); + current_signal->flags_.isPacketDispatch_ = true; } + // Make sure the slot is free for usage while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) { amd::Os::yield(); diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 43c927ff6e..4e8cd7fad3 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -254,7 +254,7 @@ class VirtualGPU : public device::VirtualDevice { //! Finds a free signal for the upcomming operation hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne, - Timestamp* ts = nullptr); + Timestamp* ts = nullptr, bool attach_signal = true); //! Wait for the curent active signal. 
Can idle the queue bool WaitCurrent() { diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index ebf47b8eb4..3f4fd6a24c 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -277,7 +277,6 @@ bool Event::notifyCmdQueue(bool cpu_wait) { notified_.clear(); return false; } - ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue); command->enqueue(); // Save notification, associated with the current event notify_event_ = command; @@ -290,7 +289,6 @@ bool Event::notifyCmdQueue(bool cpu_wait) { notified_.clear(); return false; } - ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue); command->enqueue(); command->release(); } @@ -356,8 +354,8 @@ void Command::enqueue() { Agent::postEventCreate(as_cl(static_cast(this)), type_); } - ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) enqueued: %p", - amd::activity_prof::getOclCommandKindString(this->type()), this); + ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) enqueued: %p to queue: %p", + amd::activity_prof::getOclCommandKindString(this->type()), this, queue_); // Direct dispatch logic below will submit the command immediately, but the command status // update will occur later after flush() with a wait diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index 68cad257f6..0aebbc4b72 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -213,7 +213,7 @@ class Event : public RuntimeObject { //! Returns the callback for this event const CallBackEntry* Callback() const { return callbacks_; } - // Saves HW event, associated with the current command + //! Saves HW event, associated with the current command void SetHwEvent(void* hw_event) { hw_event_ = hw_event; } //! Returns HW event, associated with the current command