diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index e4ff536840..2f2f54f924 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -449,8 +449,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d engine = HwQueueEngine::SdmaRead; } - hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine); - uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1; + auto wait_events = gpu().Barriers().WaitingSignal(engine); if (isSubwindowRectCopy ) { hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); @@ -458,10 +457,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "[%zx]!\t HSA Asycn Copy Rect wait_event=0x%zx, completion_signal=0x%zx", - std::this_thread::get_id(), (wait_event != nullptr) ? wait_event->handle : 0, + std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset, - &srcMem, &offset, &dim, agent, direction, num_wait_events, wait_event, active); + &srcMem, &offset, &dim, agent, direction, wait_events.size(), &wait_events[0], active); if (status != HSA_STATUS_SUCCESS) { gpu().Barriers().ResetCurrentSignal(); LogPrintfError("DMA buffer failed with code %d", status); @@ -480,12 +479,12 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "[%zx]!\t HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx", - std::this_thread::get_id(), (wait_event != nullptr) ? wait_event->handle : 0, + std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy( (reinterpret_cast
(dst) + dstOffset), dstAgent, (reinterpret_cast(src) + srcOffset), srcAgent, - size[0], num_wait_events, wait_event, active); + size[0], wait_events.size(), &wait_events[0], active); if (status != HSA_STATUS_SUCCESS) { gpu().Barriers().ResetCurrentSignal(); LogPrintfError("DMA buffer failed with code %d", status); @@ -662,18 +661,17 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, engine = HwQueueEngine::SdmaRead; } - hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine); - uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1; - hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); + auto wait_events = gpu().Barriers().WaitingSignal(engine); + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); // Use SDMA to transfer the data ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "[%zx]!\t HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx", - std::this_thread::get_id(), (wait_event != nullptr) ? wait_event->handle : 0, + std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, - size[0], num_wait_events, wait_event, active); + size[0], wait_events.size(), &wait_events[0], active); if (status == HSA_STATUS_SUCCESS) { gpu().addSystemScope(); } else { diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 4e355e4759..fcfb7fd8a5 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -389,9 +389,11 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( } // ================================================================================================ -hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) { +std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) { bool explicit_wait = false; - hsa_signal_t* signal = nullptr; + // Rest all current waiting signals + waiting_signals_.clear(); + // Does runtime switch the active engine? if (engine != engine_) { // Yes, return the signla from the previous operation for a wait @@ -412,30 +414,32 @@ hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) { } // Check if a wait is required if (explicit_wait) { - ProfilingSignal* prof_signal; - // Check if there is an external signal - if (external_signal_ != nullptr) { - prof_signal = external_signal_; - external_signal_ = nullptr; - } else { - prof_signal = signal_list_[current_id_]; - } - // Early signal status check - if (hsa_signal_load_relaxed(prof_signal->signal_) > 0) { - const Settings& settings = gpu_.dev().settings(); - // Actively wait on CPU for 750 us to avoid extra overheads of signal tracking on GPU - if (!WaitForSignal(prof_signal->signal_)) { - if (settings.cpu_wait_for_signal_) { - // Wait on CPU for completion if requested - CpuWaitForSignal(prof_signal); - } else { - // Return HSA signal for tracking on GPU - return &prof_signal->signal_; + ProfilingSignal** prof_signal; + // Add the oldest signal into the tracking for a wait + external_signals_.push_back(signal_list_[current_id_]); + prof_signal = &external_signals_[0]; + + // Validate all signals for the wait and skip already completed + for (uint32_t i = 0; i < external_signals_.size(); ++i) { + // Early signal status check + if (hsa_signal_load_relaxed(prof_signal[i]->signal_) > 0) { + const Settings& settings = gpu_.dev().settings(); + // Actively wait on CPU for 750 us to avoid extra overheads of signal tracking on GPU + if (!WaitForSignal(prof_signal[i]->signal_)) { + if (settings.cpu_wait_for_signal_) { + // Wait on CPU for completion if requested + CpuWaitForSignal(prof_signal[i]); + } else { + // Add HSA signal for tracking on GPU + waiting_signals_.push_back(prof_signal[i]->signal_); + } } } } + external_signals_.clear(); } - return signal; + // Return the array of waiting HSA signals + return waiting_signals_; } // ================================================================================================ @@ -790,18 +794,32 @@ bool VirtualGPU::dispatchGenericAqlPacket( return true; } +// ================================================================================================ +void VirtualGPU::dispatchBlockingWait() { + auto wait_signals = Barriers().WaitingSignal(); + // AQL dispatch doesn't support dependent signals and extra barrier packet must be generated + if (wait_signals.size() != 0) { + for (uint32_t i = 0; i < wait_signals.size(); ++i) { + uint32_t j = i % 5; + barrier_packet_.dep_signal[j] = wait_signals[i]; + constexpr bool kSkipSignal = true; + // If runtime reached the packet limit or the count limit, then flush the barrier + if ((j == 4) || ((i + 1) == wait_signals.size())) { + dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal); + barrier_packet_.dep_signal[0] = hsa_signal_t{}; + barrier_packet_.dep_signal[1] = hsa_signal_t{}; + barrier_packet_.dep_signal[2] = hsa_signal_t{}; + barrier_packet_.dep_signal[3] = hsa_signal_t{}; + barrier_packet_.dep_signal[4] = hsa_signal_t{}; + } + } + } +} + // ================================================================================================ bool VirtualGPU::dispatchAqlPacket( hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) { - hsa_signal_t* wait = Barriers().WaitingSignal(); - // AQL dispatch doesn't support dependent signals and extra barrier packet must be generated - if (wait != nullptr) { - barrier_packet_.dep_signal[0] = *wait; - constexpr bool kSkipSignal = true; - dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal); - } else { - barrier_packet_.dep_signal[0] = hsa_signal_t{}; - } + dispatchBlockingWait(); return dispatchGenericAqlPacket(packet, header, rest, blocking); } @@ -851,10 +869,10 @@ void VirtualGPU::dispatchBarrierPacket(hsa_barrier_and_packet_t* packet, packet->completion_signal.handle = 0; if (!skipSignal) { + dispatchBlockingWait(); + // Pool size must grow to the size of pending AQL packets const uint32_t pool_size = index - read; - hsa_signal_t* wait = Barriers().WaitingSignal(); - packet->dep_signal[0] = (wait != nullptr) ? *wait : hsa_signal_t{}; // Get active signal for current dispatch if profiling is necessary packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, @@ -1143,6 +1161,20 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { timestamp_ = new Timestamp(this, command); timestamp_->start(); } + + if (AMD_DIRECT_DISPATCH) { + for (auto it = command.eventWaitList().begin(); it < command.eventWaitList().end(); ++it) { + void* hw_event = ((*it)->NotifyEvent() != nullptr) ? + (*it)->NotifyEvent()->HwEvent() : (*it)->HwEvent(); + if (hw_event != nullptr) { + Barriers().AddExternalSignal(reinterpret_cast(hw_event)); + } else if (static_cast(*it)->queue() != command.queue()) { + LogPrintfError("Waiting event(%p) doesn't have a HSA signal!\n", *it); + } else { + // Assume serialization on the same queue... + } + } + } } // ================================================================================================ @@ -1157,6 +1189,9 @@ void VirtualGPU::profilingEnd(amd::Command& command) { timestamp_->end(); } command.setData(timestamp_); + if (AMD_DIRECT_DISPATCH) { + command.SetHwEvent(timestamp_->Signals().back()); + } timestamp_ = nullptr; } } @@ -1467,9 +1502,8 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { if (dev().info().hmmSupported_) { // Initialize signal for the barrier - hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown); - hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); - uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1; + auto wait_events = Barriers().WaitingSignal(HwQueueEngine::Unknown); + hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); // Find the requested agent for the transfer hsa_agent_t agent = (cmd.cpu_access() || @@ -1478,7 +1512,8 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { // Initiate a prefetch command hsa_status_t status = hsa_amd_svm_prefetch_async( - const_cast(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active); + const_cast(cmd.dev_ptr()), cmd.count(), agent, + wait_events.size(), &wait_events[0], active); // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) { @@ -2785,7 +2820,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { queue->profilingBegin(vcmd); // Add a dependency into the device queue on the current queue - queue->Barriers().SetExternalSignal(Barriers().GetLastSignal()); + queue->Barriers().AddExternalSignal(Barriers().GetLastSignal()); if (vcmd.cooperativeGroups()) { // Initialize GWS if it's cooperative groups launch @@ -2812,7 +2847,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { queue->releaseGpuMemoryFence(kSkipCpuWait); // Add a dependency into the current queue on the coop queue - Barriers().SetExternalSignal(queue->Barriers().GetLastSignal()); + Barriers().AddExternalSignal(queue->Barriers().GetLastSignal()); hasPendingDispatch_ = true; queue->profilingEnd(vcmd); @@ -2885,18 +2920,12 @@ void VirtualGPU::flush(amd::Command* list, bool wait) { amd::Command* current = list; assert(current != nullptr && "Empty batch for processing!"); - // HIP tests expect callbacks processed from another thread, hence force AQL barrier always, so - // HSA signal callback will process HIP callback asynchronously - if (list->Callback() != nullptr) { - hasPendingDispatch_ = true; - } + // Find the last command while (current->getNext() != nullptr) { current = current->getNext(); - if (current->Callback() != nullptr) { - hasPendingDispatch_ = true; - } } - + // Always insert a barrier. Some tests rquire async SDMA wait + hasPendingDispatch_ = true; // Enable profiling, so runtime can track TS profilingBegin(*current); diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index bfe3dceb39..6545cfbff3 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -206,18 +206,18 @@ class VirtualGPU : public device::VirtualDevice { void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; } //! Returns the last submitted signal for a wait - hsa_signal_t* WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute); + std::vector& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute); //! Resets current signal back to the previous one. It's necessary in a case of ROCr failure. void ResetCurrentSignal(); - //! Inserts an external signal(submission in another queue) for dependency tracking - void SetExternalSignal(ProfilingSignal* signal) { - external_signal_ = signal; + //! Adds an external signal(submission in another queue) for dependency tracking + void AddExternalSignal(ProfilingSignal* signal) { + external_signals_.push_back(signal); engine_ = HwQueueEngine::External; } - //! Inserts an external signal(submission in another queue) for dependency tracking + //! Get the last active signal on the queue ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; } private: @@ -235,10 +235,11 @@ class VirtualGPU : public device::VirtualDevice { HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations std::vector signal_list_; //!< The pool of all signals for processing - ProfilingSignal* external_signal_ = nullptr; //!< Dependency on external signal size_t current_id_ = 0; //!< Last submitted signal bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled const VirtualGPU& gpu_; //!< VirtualGPU, associated with this tracker + std::vector external_signals_; //!< External signals for a wait in this queue + std::vector waiting_signals_; //!< Current waiting signals in this queue }; VirtualGPU(Device& device, bool profiling = false, bool cooperative = false, @@ -354,9 +355,12 @@ class VirtualGPU : public device::VirtualDevice { void profilerAttach(bool enable = false) { profilerAttached_ = enable; } - bool isProfilerAttached() { return profilerAttached_; } + bool isProfilerAttached() const { return profilerAttached_; } // } roc OpenCL integration private: + //! Dispatches a barrier with blocking HSA signals + void dispatchBlockingWait(); + bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true); bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index 4e0cad1caf..b8dfb244ae 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -42,16 +42,22 @@ namespace amd { +// ================================================================================================ Event::Event(HostQueue& queue) : callbacks_(NULL), status_(CL_INT_MAX), + hw_event_(nullptr), + notify_event_(nullptr), profilingInfo_(IS_PROFILER_ON || queue.properties().test(CL_QUEUE_PROFILING_ENABLE) || Agent::shouldPostEventEvents()) { notified_.clear(); } -Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED) { notified_.clear(); } +// ================================================================================================ +Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED), + hw_event_(nullptr), notify_event_(nullptr) { notified_.clear(); } +// ================================================================================================ Event::~Event() { CallBackEntry* callback = callbacks_; while (callback != NULL) { @@ -61,6 +67,7 @@ Event::~Event() { } } +// ================================================================================================ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) { if (timeStamp == 0) { timeStamp = Os::timeNanos(); @@ -88,7 +95,7 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) { // Global epoch time since the first processed command uint64_t epoch = 0; - +// ================================================================================================ bool Event::setStatus(int32_t status, uint64_t timeStamp) { assert(status <= CL_QUEUED && "invalid status"); @@ -157,6 +164,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) { return true; } +// ================================================================================================ bool Event::resetStatus(int32_t status) { int32_t currentStatus = this->status(); if (currentStatus != CL_COMPLETE) { @@ -171,6 +179,7 @@ bool Event::resetStatus(int32_t status) { return true; } +// ================================================================================================ bool Event::setCallback(int32_t status, Event::CallBackFunction callback, void* data) { assert(status >= CL_COMPLETE && status <= CL_QUEUED && "invalid status"); @@ -193,7 +202,7 @@ bool Event::setCallback(int32_t status, Event::CallBackFunction callback, void* return true; } - +// ================================================================================================ void Event::processCallbacks(int32_t status) const { cl_event event = const_cast(as_cl(this)); const int32_t mask = (status > CL_COMPLETE) ? status : CL_COMPLETE; @@ -212,6 +221,7 @@ void Event::processCallbacks(int32_t status) const { } } +// ================================================================================================ bool Event::awaitCompletion() { if (status() > CL_COMPLETE) { // Notifies current command queue about waiting @@ -219,7 +229,8 @@ bool Event::awaitCompletion() { return false; } - ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d", this, status()); + ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d", + this, status()); auto* queue = command().queue(); if ((queue != nullptr) && queue->vdev()->ActiveWait()) { while (status() > CL_COMPLETE) { @@ -262,6 +273,8 @@ bool Event::notifyCmdQueue() { ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue); command->enqueue(); command->release(); + // Save notification, associated with the current event + notify_event_ = command; } return true; } @@ -306,10 +319,10 @@ void Command::enqueue() { // update will occur later after flush() with a wait if (AMD_DIRECT_DISPATCH) { setStatus(CL_QUEUED); - // The wait should be performed before the lock, - // otherwise signal handler may have a deadlock, but awaitCompletion() is thread safe itself + // Notify all commands about the waiter. Barrier will be sent in order to obtain + // HSA signal for a wait on the current queue std::for_each(eventWaitList().begin(), eventWaitList().end(), - std::mem_fun(&Command::awaitCompletion)); + std::mem_fun(&Command::notifyCmdQueue)); // The batch update must be lock protected to avoid a race condition // when multiple threads submit/flush/update the batch at the same time diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index 68c249b1a6..21bce75a09 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -93,6 +93,8 @@ class Event : public RuntimeObject { std::atomic callbacks_; //!< linked list of callback entries. std::atomic status_; //!< current execution status. std::atomic_flag notified_; //!< Command queue was notified + void* hw_event_; //!< HW event ID associated with SW event + Event* notify_event_; //!< Notify event, which should contain HW signal protected: static const EventWaitList nullWaitList; @@ -210,6 +212,15 @@ class Event : public RuntimeObject { //! Returns the callback for this event const CallBackEntry* Callback() const { return callbacks_; } + + // Saves HW event, associated with the current command + void SetHwEvent(void* hw_event) { hw_event_ = hw_event; } + + //! Returns HW event, associated with the current command + void* HwEvent() const { return hw_event_; } + + //! Returns notify even associated with the current command + Event* NotifyEvent() const { return notify_event_; } }; /*! \brief An operation that is submitted to a command queue.