diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 8e0ffa2bcc..e8e21a5c7f 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -115,7 +115,9 @@ void Timestamp::checkGpuTime() { uint64_t end = 0; for (auto it : signals_) { - if (hsa_signal_load_relaxed(it->signal_) > 0) { + // Ignore the wait if runtime processes API callback, because the signal value is bigger + // than expected and the value reset will occur after API callback is done + if (GetCallbackSignal().handle == 0) { WaitForSignal(it->signal_); } // Avoid profiling data for the sync barrier, in tiny performance tests the first call @@ -153,7 +155,7 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) { ((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) { return false; } - amd::ScopedLock sl(ts->gpu()->execution()); + if (ts->gpu()->isProfilerAttached()) { amd::Command* head = ts->getParsedCommand(); if (head == nullptr) { @@ -185,9 +187,17 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) { ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(0x%lx)", static_cast(value), arg, ts->HwProfiling() ? ts->Signals()[0]->signal_.handle : 0); + // Save callback signal + hsa_signal_t callback_signal = ts->GetCallbackSignal(); + // Update the batch, since signal is complete ts->gpu()->updateCommandsState(ts->command().GetBatchHead()); + // Reset API callback signal. It will release AQL queue and start commands processing + if (callback_signal.handle != 0) { + hsa_signal_subtract_relaxed(callback_signal, 1); + } + // Return false, so the callback will not be called again for this signal return false; } @@ -371,8 +381,17 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( // If direct dispatch is enabled and the batch head isn't null, then it's a marker and // requires the batch update upon HSA signal completion if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) { + uint32_t init_value = kInitSignalValueOne; + // If API callback is enabled, then use a blocking signal for AQL queue. + // HSA signal will be acquired in SW and released after HSA signal callback + if (ts->command().Callback() != nullptr) { + ts->SetCallbackSignal(prof_signal->signal_); + // Blocks AQL queue from further processing + hsa_signal_add_relaxed(prof_signal->signal_, 1); + init_value += 1; + } hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_, - HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, &HsaAmdSignalHandler, ts); + HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts); if (HSA_STATUS_SUCCESS != result) { LogError("hsa_amd_signal_async_handler() failed to set the handler!"); } else { @@ -396,7 +415,7 @@ std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi // Does runtime switch the active engine? if (engine != engine_) { - // Yes, return the signla from the previous operation for a wait + // Yes, return the signal from the previous operation for a wait engine_ = engine; explicit_wait = true; } else { @@ -404,8 +423,8 @@ std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi if (engine == HwQueueEngine::Unknown) { explicit_wait = true; } else { - // Check if skip wait optimizaiton is enabled. It will try to predice the same engine in ROCr - // and ignore signal wait, relying on in-order engine execution + // Check if skip wait optimization is enabled. It will try to predict the same engine in ROCr + // and ignore the signal wait, relying on in-order engine execution const Settings& settings = gpu_.dev().settings(); if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) { explicit_wait = true; @@ -414,24 +433,33 @@ std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi } // Check if a wait is required if (explicit_wait) { - ProfilingSignal** prof_signal; + bool skip_internal_signal = false; + + for (uint32_t i = 0; i < external_signals_.size(); ++i) { + // If external signal matches internal one, then skip it + if (external_signals_[i]->signal_.handle == + signal_list_[current_id_]->signal_.handle) { + skip_internal_signal = true; + } + } // Add the oldest signal into the tracking for a wait - external_signals_.push_back(signal_list_[current_id_]); - prof_signal = &external_signals_[0]; + if (!skip_internal_signal) { + external_signals_.push_back(signal_list_[current_id_]); + } // Validate all signals for the wait and skip already completed for (uint32_t i = 0; i < external_signals_.size(); ++i) { // Early signal status check - if (hsa_signal_load_relaxed(prof_signal[i]->signal_) > 0) { + if (hsa_signal_load_relaxed(external_signals_[i]->signal_) > 0) { const Settings& settings = gpu_.dev().settings(); // Actively wait on CPU for 750 us to avoid extra overheads of signal tracking on GPU - if (!WaitForSignal(prof_signal[i]->signal_)) { + if (!WaitForSignal(external_signals_[i]->signal_)) { if (settings.cpu_wait_for_signal_) { // Wait on CPU for completion if requested - CpuWaitForSignal(prof_signal[i]); + CpuWaitForSignal(external_signals_[i]); } else { // Add HSA signal for tracking on GPU - waiting_signals_.push_back(prof_signal[i]->signal_); + waiting_signals_.push_back(external_signals_[i]->signal_); } } } @@ -891,12 +919,6 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) { __atomic_store_n(reinterpret_cast(aql_loc), packetHeader, __ATOMIC_RELEASE); hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); - // Clear dependent signals for the next packet - barrier_packet_.dep_signal[0] = hsa_signal_t{}; - barrier_packet_.dep_signal[1] = hsa_signal_t{}; - barrier_packet_.dep_signal[2] = hsa_signal_t{}; - barrier_packet_.dep_signal[3] = hsa_signal_t{}; - barrier_packet_.dep_signal[4] = hsa_signal_t{}; ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, "[%zx] HWq=0x%zx, BarrierAND Header = 0x%x (type=%d, barrier=%d, acquire=%d," " release=%d), " @@ -913,6 +935,12 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) { barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1], barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3], barrier_packet_.dep_signal[4], barrier_packet_.completion_signal); + // Clear dependent signals for the next packet + barrier_packet_.dep_signal[0] = hsa_signal_t{}; + barrier_packet_.dep_signal[1] = hsa_signal_t{}; + barrier_packet_.dep_signal[2] = hsa_signal_t{}; + barrier_packet_.dep_signal[3] = hsa_signal_t{}; + barrier_packet_.dep_signal[4] = hsa_signal_t{}; } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 85987083b1..f39d901bfb 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -90,7 +90,11 @@ class Timestamp : public amd::HeapObject { VirtualGPU* gpu_; //!< Virtual GPU, associated with this timestamp const amd::Command& command_; //!< Command, associated with this timestamp amd::Command* parsedCommand_; //!< Command down the list, considering command_ as head - std::vector signals_; + std::vector signals_; //!< The list of all signals, associated with the TS + hsa_signal_t callback_signal_; //!< Signal associated with a callback for possible later update + + Timestamp(const Timestamp&) = delete; + Timestamp& operator=(const Timestamp&) = delete; public: Timestamp(VirtualGPU* gpu, const amd::Command& command) @@ -98,7 +102,8 @@ class Timestamp : public amd::HeapObject { , end_(0) , gpu_(gpu) , command_(command) - , parsedCommand_(nullptr) {} + , parsedCommand_(nullptr) + , callback_signal_(hsa_signal_t{}) {} ~Timestamp() {} @@ -141,6 +146,14 @@ class Timestamp : public amd::HeapObject { //! Returns virtual GPU device, used with this timestamp VirtualGPU* gpu() const { return gpu_; } + + //! Updates the callback signal + void SetCallbackSignal(hsa_signal_t callback_signal) { + callback_signal_ = callback_signal; + } + + //! Returns the callback signal + hsa_signal_t GetCallbackSignal() const { return callback_signal_; } }; class VirtualGPU : public device::VirtualDevice {