diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index be27ae96da..69ab3590df 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -3012,36 +3012,22 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) { } // ================================================================================================ -ProfilingSignal* Device::GetGlobalSignal(Timestamp* ts) const { - std::unique_ptr prof_signal(new ProfilingSignal()); - if (prof_signal != nullptr) { - hsa_agent_t agent = getBackendDevice(); - hsa_agent_t* agents = (settings().system_scope_signal_) ? nullptr : &agent; - uint32_t num_agents = (settings().system_scope_signal_) ? 0 : 1; - - if (ts != 0) { - // Save HSA signal earlier to make sure the possible callback will have a valid - // value for processing - prof_signal->ts_ = ts; - ts->AddProfilingSignal(prof_signal.get()); - } - - if (HSA_STATUS_SUCCESS == hsa_signal_create(kInitSignalValueOne, - num_agents, agents, &prof_signal->signal_)) { - return prof_signal.release(); - } +void Device::ReleaseGlobalSignal(void* signal) const { + if (signal != nullptr) { + reinterpret_cast(signal)->release(); } - return nullptr; } // ================================================================================================ -void Device::ReleaseGlobalSignal(void* signal) const { - if (signal != nullptr) { - ProfilingSignal* prof_signal = reinterpret_cast(signal); - if (prof_signal->signal_.handle != 0) { - hsa_signal_destroy(prof_signal->signal_); +ProfilingSignal::~ProfilingSignal() { + if (signal_.handle != 0) { + if (hsa_signal_load_relaxed(signal_) > 0) { + LogError("Runtime shouldn't destroy a signal that is still busy!"); + if (hsa_signal_wait_scacquire(signal_, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, + kUnlimitedWait, HSA_WAIT_STATE_BLOCKED) != 0) { + } } - delete prof_signal; + hsa_signal_destroy(signal_); } } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 572d817c6e..e8f4791042 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -77,7 +77,8 @@ class VirtualDevice; class PrintfDbg; class IProDevice; -struct ProfilingSignal : public amd::HeapObject { +class ProfilingSignal : public amd::ReferenceCountedObject { +public: hsa_signal_t signal_; //!< HSA signal to track profiling information Timestamp* ts_; //!< Timestamp object associated with the signal HwQueueEngine engine_; //!< Engine used with this signal @@ -89,6 +90,8 @@ struct ProfilingSignal : public amd::HeapObject { , done_(true) , lock_("Signal Ops Lock", true) { signal_.handle = 0; } + + virtual ~ProfilingSignal(); amd::Monitor& LockSignalOps() { return lock_; } }; @@ -531,8 +534,6 @@ class Device : public NullDevice { virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset); - ProfilingSignal* GetGlobalSignal(Timestamp* ts) const; - private: bool create(); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 07147600fb..96172b6aff 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -319,10 +319,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) { // ================================================================================================ VirtualGPU::HwQueueTracker::~HwQueueTracker() { for (auto& signal: signal_list_) { - if (signal->signal_.handle != 0) { - hsa_signal_destroy(signal->signal_); - } - delete signal; + signal->release(); } } @@ -374,6 +371,26 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( // a GPU waiter(which may be not triggered yet) and CPU signal reset below WaitNext(); + if (signal_list_[current_id_]->referenceCount() > 1) { + // The signal was assigned to the global marker's event, hence runtime can't reuse it + // and needs a new signal + std::unique_ptr signal(new ProfilingSignal()); + if (signal != nullptr) { + hsa_agent_t agent = gpu_.gpu_device(); + const Settings& settings = gpu_.dev().settings(); + hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent; + uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1; + + if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) { + signal_list_[current_id_]->release(); + signal_list_[current_id_] = signal.release(); + } else { + assert(!"ProfilingSignal reallocaiton failed! Marker has a conflict with signal reuse!"); + } + } else { + assert(!"ProfilingSignal reallocaiton failed! Marker has a conflict with signal reuse!"); + } + } ProfilingSignal* prof_signal = signal_list_[current_id_]; // Reset the signal and return hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val); @@ -387,7 +404,23 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( // If direct dispatch is enabled and the batch head isn't null, then it's a marker and // requires the batch update upon HSA signal completion if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) { - assert(false && "Runtime should not have batch command in ActiveSignal!"); + uint32_t init_value = kInitSignalValueOne; + // If API callback is enabled, then use a blocking signal for AQL queue. + // HSA signal will be acquired in SW and released after HSA signal callback + if (ts->command().Callback() != nullptr) { + ts->SetCallbackSignal(prof_signal->signal_); + // Blocks AQL queue from further processing + hsa_signal_add_relaxed(prof_signal->signal_, 1); + init_value += 1; + } + hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_, + HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts); + if (HSA_STATUS_SUCCESS != result) { + LogError("hsa_amd_signal_async_handler() failed to set the handler!"); + } else { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)", + prof_signal->signal_.handle, prof_signal); + } } if (!sdma_profiling_) { hsa_amd_profiling_async_copy_enable(true); @@ -872,8 +905,7 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, } // ================================================================================================ -void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, - bool skipSignal, const ProfilingSignal* global_signal) { +void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) { const uint32_t queueSize = gpu_queue_->size; const uint32_t queueMask = queueSize - 1; @@ -896,16 +928,12 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, barrier_packet_.completion_signal.handle = 0; if (!skipSignal) { - if (global_signal != nullptr) { - barrier_packet_.completion_signal = global_signal->signal_; - } else { - // Pool size must grow to the size of pending AQL packets - const uint32_t pool_size = index - read; + // Pool size must grow to the size of pending AQL packets + const uint32_t pool_size = index - read; - // Get active signal for current dispatch if profiling is necessary - barrier_packet_.completion_signal = - Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size); - } + // Get active signal for current dispatch if profiling is necessary + barrier_packet_.completion_signal = + Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size); } while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); @@ -1226,6 +1254,12 @@ void VirtualGPU::profilingEnd(amd::Command& command) { } command.setData(timestamp_); + // Update HW event only for batches + if ((AMD_DIRECT_DISPATCH) && (command.GetBatchHead() != nullptr)) { + timestamp_->Signals().back()->retain(); + command.SetHwEvent(timestamp_->Signals().back()); + } + timestamp_ = nullptr; } } @@ -2889,7 +2923,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { queue->profilingEnd(vcmd); } else { - // Make sure VirtualGPU has an exclusive access to the resources + // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); @@ -2913,47 +2947,23 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) { // ================================================================================================ void VirtualGPU::submitMarker(amd::Marker& vcmd) { if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) { - profilingBegin(vcmd); - if (timestamp_ != nullptr) { - ProfilingSignal* prof_signal = nullptr; - // If direct dispatch is enabled and the batch head isn't null, then it's a marker and - // requires the batch update upon HSA signal completion - if (AMD_DIRECT_DISPATCH) { - assert(vcmd.GetBatchHead() != nullptr && "Marker doesn't have batch!"); + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + if (vcmd.CpuWaitRequested() && hasPendingDispatch_ == false) { + // It should be safe to call flush directly if there are not pending dispatches without + // HSA signal callback + flush(vcmd.GetBatchHead()); + } else { + profilingBegin(vcmd); + if (timestamp_ != nullptr) { + // Submit a barrier with a cache flushes. + dispatchBarrierPacket(kBarrierPacketHeader, false); - prof_signal = dev().GetGlobalSignal(timestamp_); - prof_signal->done_ = false; - - assert(prof_signal != nullptr && "Failed to allocate the global HSA signal!"); - uint32_t init_value = kInitSignalValueOne; - // If API callback is enabled, then use a blocking signal for AQL queue. - // HSA signal will be acquired in SW and released after HSA signal callback - if (vcmd.Callback() != nullptr) { - timestamp_->SetCallbackSignal(prof_signal->signal_); - // Blocks AQL queue from further processing - hsa_signal_add_relaxed(prof_signal->signal_, 1); - init_value += 1; - } - - hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_, - HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, timestamp_); - if (HSA_STATUS_SUCCESS != result) { - LogError("hsa_amd_signal_async_handler() failed to set the handler!"); - } else { - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)", - prof_signal->signal_.handle, prof_signal); - } - // Update HW event only for batches - vcmd.SetHwEvent(timestamp_->Signals().back()); + hasPendingDispatch_ = false; } - // Submit a barrier with a cache flushes. - dispatchBarrierPacket(kBarrierPacketHeader, false, prof_signal); - - // Don't reset the flag for direct dispatch, because the global signals are out of scope - // for internal barrier tracking and SDMA could lose a wait for compute - hasPendingDispatch_ = AMD_DIRECT_DISPATCH; + profilingEnd(vcmd); } - profilingEnd(vcmd); + } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 85f938370b..f980358f20 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -223,6 +223,7 @@ class VirtualGPU : public device::VirtualDevice { //! Update current active engine void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; } + HwQueueEngine GetActiveEngine() const { return engine_; } //! Returns the last submitted signal for a wait std::vector& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute); @@ -385,8 +386,7 @@ class VirtualGPU : public device::VirtualDevice { template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size = 1); - void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false, - const ProfilingSignal* global_signal = nullptr); + void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false); bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); void dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t* packet, diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index 4c53d55c52..f4567f4179 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -232,11 +232,12 @@ void Event::processCallbacks(int32_t status) const { } } +static constexpr bool kCpuWait = true; // ================================================================================================ bool Event::awaitCompletion() { if (status() > CL_COMPLETE) { - // Notifies current command queue about waiting - if (!notifyCmdQueue()) { + // Notifies the current command queue about waiting + if (!notifyCmdQueue(kCpuWait)) { return false; } @@ -262,7 +263,7 @@ bool Event::awaitCompletion() { } // ================================================================================================ -bool Event::notifyCmdQueue() { +bool Event::notifyCmdQueue(bool cpu_wait) { HostQueue* queue = command().queue(); if (AMD_DIRECT_DISPATCH) { ScopedLock l(notify_lock_); @@ -271,7 +272,7 @@ bool Event::notifyCmdQueue() { (HwEvent() == nullptr) && !notified_.test_and_set()) { // Make sure the queue is draining the enqueued commands. - amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this); + amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait); if (command == NULL) { notified_.clear(); return false; @@ -341,7 +342,7 @@ void Command::enqueue() { // Notify all commands about the waiter. Barrier will be sent in order to obtain // HSA signal for a wait on the current queue std::for_each(eventWaitList().begin(), eventWaitList().end(), - std::mem_fun(&Command::notifyCmdQueue)); + std::bind2nd(std::mem_fun(&Command::notifyCmdQueue), !kCpuWait)); // The batch update must be lock protected to avoid a race condition // when multiple threads submit/flush/update the batch at the same time diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index 36e71360a7..7282a28c1a 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -207,7 +207,7 @@ class Event : public RuntimeObject { /*! \brief Notifies current command queue about execution status */ - bool notifyCmdQueue(); + bool notifyCmdQueue(bool cpu_wait = false); //! RTTI internal implementation virtual ObjectType objectType() const { return ObjectTypeEvent; } @@ -998,15 +998,22 @@ class ExternalSemaphoreCmd : public Command { class Marker : public Command { + private: + bool cpu_wait_; //!< If true, then the marker was issued for CPU/GPU sync + public: //! Create a new Marker Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList, - const Event* waitingEvent = nullptr) - : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) {} + const Event* waitingEvent = nullptr, bool cpu_wait = false) + : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) + , cpu_wait_(cpu_wait) {} //! The actual command implementation. virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); } + //! Check if this marker requires CPU wait + bool CpuWaitRequested() const { return cpu_wait_; } + }; /*! \brief Maps CL objects created from external ones and syncs the contents (blocking).