diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 29346d2380..7c0e13db1f 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -49,6 +49,7 @@ #include #include #include +#include #include namespace amd { @@ -1705,6 +1706,7 @@ class Device : public RuntimeObject { ) const { return false; }; + virtual void ReleaseGlobalSignal(void* signal) const {} //!< Releases the HW signal passed in; does nothing in this base class //! Returns TRUE if the device is available for computations bool isOnline() const { return online_; } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 0551e0e983..c6e94732a6 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -2919,6 +2919,7 @@ bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool, return true; } +// ================================================================================================ void Device::getGlobalCUMask(std::string cuMaskStr) { if (cuMaskStr.length() != 0) { std::string pre = cuMaskStr.substr(0, 2); @@ -2971,10 +2972,12 @@ void Device::getGlobalCUMask(std::string cuMaskStr) { } } +// ================================================================================================ device::Signal* Device::createSignal() const { return new roc::Signal(); } +// ================================================================================================ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) { // If arena_mem_obj_ is null, then HMM and Xnack is disabled. Return nullptr. 
if (arena_mem_obj_ == nullptr) { @@ -2989,5 +2992,39 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) { return arena_mem_obj_; } +// ================================================================================================ +ProfilingSignal* Device::GetGlobalSignal(Timestamp* ts) const { // Returns nullptr if the HSA signal can't be created + std::unique_ptr<ProfilingSignal> prof_signal(new ProfilingSignal()); + if (prof_signal != nullptr) { + hsa_agent_t agent = getBackendDevice(); + hsa_agent_t* agents = (settings().system_scope_signal_) ? nullptr : &agent; + uint32_t num_agents = (settings().system_scope_signal_) ? 0 : 1; + + if (HSA_STATUS_SUCCESS == hsa_signal_create(kInitSignalValueOne, + num_agents, agents, &prof_signal->signal_)) { + if (ts != nullptr) { + // Register with the timestamp only once the HSA signal exists. On a failed + // create the unique_ptr frees this object, so its address must not have been + // handed to ts beforehand + prof_signal->ts_ = ts; + ts->AddProfilingSignal(prof_signal.get()); + } + return prof_signal.release(); + } + } + return nullptr; +} + +// ================================================================================================ +void Device::ReleaseGlobalSignal(void* signal) const { // Destroys the HSA signal (if created) and frees the object + if (signal != nullptr) { + ProfilingSignal* prof_signal = reinterpret_cast<ProfilingSignal*>(signal); + if (prof_signal->signal_.handle != 0) { + hsa_signal_destroy(prof_signal->signal_); + } + delete prof_signal; + } +} + } // namespace roc #endif // WITHOUT_HSA_BACKEND diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 78cfbb6241..0fb6f0b8e0 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -77,6 +77,21 @@ class VirtualDevice; class PrintfDbg; class IProDevice; +struct ProfilingSignal : public amd::HeapObject { + hsa_signal_t signal_; //!< HSA signal to track profiling information + Timestamp* ts_; //!< Timestamp object associated with the signal + HwQueueEngine engine_; //!< Engine used with this signal + bool done_; //!< True if signal is done + 
amd::Monitor lock_; //!< Signal lock for update + ProfilingSignal() + : ts_(nullptr) + , engine_(HwQueueEngine::Compute) + , done_(true) + , lock_("Signal Ops Lock", true) + { signal_.handle = 0; } + amd::Monitor& LockSignalOps() { return lock_; } +}; + class Sampler : public device::Sampler { public: //! Constructor @@ -237,6 +252,7 @@ class NullDevice : public amd::Device { cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; } virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; } + virtual void ReleaseGlobalSignal(void* signal) const {} protected: //! Initialize compiler instance and handle @@ -405,6 +421,7 @@ class Device : public NullDevice { cl_set_device_clock_mode_output_amd* pSetClockModeOutput); virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const; + virtual void ReleaseGlobalSignal(void* signal) const; //!< Destroys the HSA signal of the given ProfilingSignal and deletes the object //! Allocate host memory in terms of numa policy set by user void* hostNumaAlloc(size_t size, size_t alignment, bool atomics = false) const; @@ -505,6 +522,8 @@ class Device : public NullDevice { virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset); + ProfilingSignal* GetGlobalSignal(Timestamp* ts) const; //!< Allocates a ProfilingSignal with a new HSA signal; nullptr on failure + private: bool create(); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 5f2659c76a..d31f01982c 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -109,7 +109,7 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) { }; // ================================================================================================ -void Timestamp::checkGpuTime(bool event_recycle) { +void Timestamp::checkGpuTime() { if (HwProfiling()) { uint64_t start = std::numeric_limits::max(); uint64_t end = 0; @@ -140,10 +140,6 @@ void Timestamp::checkGpuTime(bool event_recycle) { ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = 
(0x%lx), start = %ld, " "end = %ld", it->signal_.handle, start, end); } - // The signal is reused and the upper layer can't rely on it. - if (event_recycle) { - const_cast(it->ts_->command_).SetHwEvent(nullptr); - } it->ts_ = nullptr; it->done_ = true; } @@ -390,23 +386,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( // If direct dispatch is enabled and the batch head isn't null, then it's a marker and // requires the batch update upon HSA signal completion if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) { - uint32_t init_value = kInitSignalValueOne; - // If API callback is enabled, then use a blocking signal for AQL queue. - // HSA signal will be acquired in SW and released after HSA signal callback - if (ts->command().Callback() != nullptr) { - ts->SetCallbackSignal(prof_signal->signal_); - // Blocks AQL queue from further processing - hsa_signal_add_relaxed(prof_signal->signal_, 1); - init_value += 1; - } - hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_, - HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts); - if (HSA_STATUS_SUCCESS != result) { - LogError("hsa_amd_signal_async_handler() failed to set the handler!"); - } else { - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)", - prof_signal->signal_.handle, prof_signal); - } + assert(false && "Runtime should not have batch command in ActiveSignal!"); // Batch markers get their signal and handler in submitMarker() } if (!sdma_profiling_) { hsa_amd_profiling_async_copy_enable(true); @@ -486,8 +466,7 @@ bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) { if (!signal->done_) { // Update timestamp values if requested if (signal->ts_ != nullptr) { - static constexpr bool kEventRecycle = true; - signal->ts_->checkGpuTime(kEventRecycle); + signal->ts_->checkGpuTime(); } else { ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "[%zx]!\t Host wait on completion_signal=0x%zx", std::this_thread::get_id(), signal->signal_.handle); @@ -892,7 +871,8 @@ bool 
VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, } // ================================================================================================ -void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) { +void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, + bool skipSignal, const ProfilingSignal* global_signal) { const uint32_t queueSize = gpu_queue_->size; const uint32_t queueMask = queueSize - 1; @@ -915,12 +895,16 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) { barrier_packet_.completion_signal.handle = 0; if (!skipSignal) { - // Pool size must grow to the size of pending AQL packets - const uint32_t pool_size = index - read; + if (global_signal != nullptr) { + barrier_packet_.completion_signal = global_signal->signal_; // Use the supplied signal instead of one from Barriers() + } else { + // Pool size must grow to the size of pending AQL packets + const uint32_t pool_size = index - read; - // Get active signal for current dispatch if profiling is necessary - barrier_packet_.completion_signal = - Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size); + // Get active signal for current dispatch if profiling is necessary + barrier_packet_.completion_signal = + Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size); + } } while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); @@ -1218,7 +1202,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { (*it)->NotifyEvent()->HwEvent() : (*it)->HwEvent(); if (hw_event != nullptr) { Barriers().AddExternalSignal(reinterpret_cast(hw_event)); - } else if (static_cast(*it)->queue() != command.queue()) { + } else if (static_cast(*it)->queue() != command.queue() && + ((*it)->status() != CL_COMPLETE)) { LogPrintfError("Waiting event(%p) doesn't have a HSA signal!\n", *it); } else { // Assume serialization on the same queue... 
@@ -1239,10 +1224,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) { timestamp_->end(); } command.setData(timestamp_); - // Update HW event only for batches - if ((AMD_DIRECT_DISPATCH) && (command.GetBatchHead() != nullptr)) { - command.SetHwEvent(timestamp_->Signals().back()); - } + timestamp_ = nullptr; } } @@ -2926,13 +2908,46 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) { // ================================================================================================ void VirtualGPU::submitMarker(amd::Marker& vcmd) { - if (vcmd.profilingInfo().marker_ts_) { + if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) { profilingBegin(vcmd); if (timestamp_ != nullptr) { + ProfilingSignal* prof_signal = nullptr; + // If direct dispatch is enabled and the batch head isn't null, then it's a marker and + // requires the batch update upon HSA signal completion + if (AMD_DIRECT_DISPATCH) { + assert(vcmd.GetBatchHead() != nullptr && "Marker doesn't have batch!"); + + prof_signal = dev().GetGlobalSignal(timestamp_); + // Check the allocation before the first dereference of prof_signal + assert(prof_signal != nullptr && "Failed to allocate the global HSA signal!"); + prof_signal->done_ = false; + uint32_t init_value = kInitSignalValueOne; + // If API callback is enabled, then use a blocking signal for AQL queue. 
+ // HSA signal will be acquired in SW and released after HSA signal callback + if (vcmd.Callback() != nullptr) { + timestamp_->SetCallbackSignal(prof_signal->signal_); + // Blocks AQL queue from further processing + hsa_signal_add_relaxed(prof_signal->signal_, 1); + init_value += 1; + } + + hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_, + HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, timestamp_); + if (HSA_STATUS_SUCCESS != result) { + LogError("hsa_amd_signal_async_handler() failed to set the handler!"); + } else { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)", + prof_signal->signal_.handle, prof_signal); + } + // Update HW event only for batches + vcmd.SetHwEvent(timestamp_->Signals().back()); + } // Submit a barrier with a cache flushes. - dispatchBarrierPacket(kBarrierPacketHeader); - // Reset this flag since we already enable system scope for kBarrierPacketHeader - hasPendingDispatch_ = false; + dispatchBarrierPacket(kBarrierPacketHeader, false, prof_signal); + + // Don't reset the flag for direct dispatch, because the global signals are out of scope + // for internal barrier tracking and SDMA could lose a wait for compute + hasPendingDispatch_ = AMD_DIRECT_DISPATCH; } profilingEnd(vcmd); } @@ -2958,45 +2973,13 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) { // ================================================================================================ void VirtualGPU::flush(amd::Command* list, bool wait) { - // Direct dispatch relies on HSA signal callback - bool skip_cpu_wait = AMD_DIRECT_DISPATCH; + // If barrier is requested, then wait for everything, otherwise + // a per dispatch wait will occur later in updateCommandsState() + releaseGpuMemoryFence(); + updateCommandsState(list); - if (skip_cpu_wait) { - // Search for the last command in the batch to track GPU state - amd::Command* current = list; - assert(current != nullptr && "Empty batch 
for processing!"); - - // Find the last command - while (current->getNext() != nullptr) { - current = current->getNext(); - } - // Always insert a barrier. Some tests rquire async SDMA wait - hasPendingDispatch_ = true; - // Enable profiling, so runtime can track TS - profilingBegin(*current); - - // If runtime didn't submit a barrier, then it can't track the completion of the batch. - // Hence runtime either has to insert a barrier unconditionally or have a CPU wait. - // Due to performance impact of extra barriers CPU wait is selected. - // Note: if callback will be selected to update the batch status, - // then the host thread can't update it also, otherwise double free may occur - skip_cpu_wait &= hasPendingDispatch_; - - releaseGpuMemoryFence(skip_cpu_wait); - profilingEnd(*current); - } else { - // If barrier is requested, then wait for everything, otherwise - // a per disaptch wait will occur later in updateCommandsState() - releaseGpuMemoryFence(); - } - - // If CPU waited for GPU, then the queue is idle - if (!skip_cpu_wait) { - updateCommandsState(list); - - // Release all pinned memory - releasePinnedMem(); - } + // Release all pinned memory + releasePinnedMem(); } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index aaa05acf67..64f556e412 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -34,23 +34,9 @@ namespace roc { class Device; class Memory; +struct ProfilingSignal; // Full definition lives in rocdevice.hpp class Timestamp; -struct ProfilingSignal : public amd::HeapObject { - amd::Monitor lock_; //!< Signal lock for update - hsa_signal_t signal_; //!< HSA signal to track profiling information - Timestamp* ts_; //!< Timestamp object associated with the signal - HwQueueEngine engine_; //!< Engine used with this signal - bool done_; //!< True if signal is done - 
ProfilingSignal() - : lock_("Signal Ops Lock", true) - , ts_(nullptr) - , engine_(HwQueueEngine::Compute) - , done_(true) - { signal_.handle = 0; } - amd::Monitor& LockSignalOps() { return lock_; } -}; - // Initial HSA signal value constexpr static hsa_signal_value_t kInitSignalValueOne = 1; @@ -139,7 +125,7 @@ class Timestamp : public amd::HeapObject { const bool HwProfiling() const { return !signals_.empty(); } //! Finds execution ticks on GPU - void checkGpuTime(bool event_recycle = false); + void checkGpuTime(); // Start a timestamp (get timestamp from OS) void start() { start_ = amd::Os::timeNanos(); } @@ -225,8 +211,6 @@ class VirtualGPU : public device::VirtualDevice { //! Wait for the curent active signal. Can idle the queue bool WaitCurrent() { ProfilingSignal* signal = signal_list_[current_id_]; - ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitCurret completion_signal=0x%zx", - std::this_thread::get_id(), signal->signal_.handle); return CpuWaitForSignal(signal); } @@ -253,8 +237,6 @@ class VirtualGPU : public device::VirtualDevice { void WaitNext() { size_t next = (current_id_ + 1) % signal_list_.size(); ProfilingSignal* signal = signal_list_[next]; - ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitNext completion_signal=0x%zx", - std::this_thread::get_id(), signal->signal_.handle); CpuWaitForSignal(signal); } @@ -396,7 +378,8 @@ class VirtualGPU : public device::VirtualDevice { template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size = 1); - void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false); + void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false, + const ProfilingSignal* global_signal = nullptr); //!< A non-null global_signal becomes the barrier's completion signal unless skipSignal is set bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); void dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t* packet, diff 
--git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index 082deaacc9..19c29cce6b 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -48,6 +48,7 @@ Event::Event(HostQueue& queue) status_(CL_INT_MAX), hw_event_(nullptr), notify_event_(nullptr), + device_(&queue.device()), profilingInfo_(IS_PROFILER_ON || queue.properties().test(CL_QUEUE_PROFILING_ENABLE) || Agent::shouldPostEventEvents()) { notified_.clear(); @@ -55,7 +56,7 @@ Event::Event(HostQueue& queue) // ================================================================================================ Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED), - hw_event_(nullptr), notify_event_(nullptr) { notified_.clear(); } + hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); } // ================================================================================================ Event::~Event() { @@ -69,6 +70,10 @@ Event::~Event() { if (notify_event_ != nullptr) { notify_event_->release(); } + // Release the HW event through the device captured at construction (device_ is null for queue-less events) + if ((hw_event_ != nullptr) && (device_ != nullptr)) { + device_->ReleaseGlobalSignal(hw_event_); + } } // ================================================================================================ @@ -259,21 +264,35 @@ bool Event::awaitCompletion() { // ================================================================================================ bool Event::notifyCmdQueue() { HostQueue* queue = command().queue(); - if ((status() > CL_COMPLETE) && (nullptr != queue) && - (!AMD_DIRECT_DISPATCH || - // If HW event was assigned, then notification can be ignored, since a barrier was issued - (HwEvent() == nullptr)) && - !notified_.test_and_set()) { - // Make sure the queue is draining the enqueued commands. 
- amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this); - if (command == NULL) { - notified_.clear(); - return false; + if (AMD_DIRECT_DISPATCH) { + ScopedLock l(lock_); // Held across the HwEvent() check and the notify_event_ update below + if ((status() > CL_COMPLETE) && (nullptr != queue) && + // If HW event was assigned, then notification can be ignored, since a barrier was issued + (HwEvent() == nullptr) && + !notified_.test_and_set()) { + // Make sure the queue is draining the enqueued commands. + amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this); + if (command == NULL) { + notified_.clear(); + return false; + } + ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue); + command->enqueue(); + // Save notification, associated with the current event + notify_event_ = command; + } + } else { + if ((status() > CL_COMPLETE) && (nullptr != queue) && !notified_.test_and_set()) { + // Make sure the queue is draining the enqueued commands. + amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this); + if (command == NULL) { + notified_.clear(); + return false; + } + ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue); + command->enqueue(); + command->release(); } - ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue); - command->enqueue(); - // Save notification, associated with the current event - notify_event_ = command; } return true; } @@ -318,6 +337,7 @@ void Command::enqueue() { // update will occur later after flush() with a wait if (AMD_DIRECT_DISPATCH) { setStatus(CL_QUEUED); + // Notify all commands about the waiter. Barrier will be sent in order to obtain // HSA signal for a wait on the current queue std::for_each(eventWaitList().begin(), eventWaitList().end(), @@ -333,13 +353,10 @@ void Command::enqueue() { // Update batch head for the current marker. 
Hence the status of all commands can be // updated upon the marker completion SetBatchHead(queue_->GetSubmittionBatch()); - if (profilingInfo().marker_ts_) { - setStatus(CL_SUBMITTED); - submit(*queue_->vdev()); - } else { - // Flush the current batch, but skip the wait on CPU if possible to avoid a stall - queue_->vdev()->flush(queue_->GetSubmittionBatch()); - } + + setStatus(CL_SUBMITTED); + submit(*queue_->vdev()); + // The batch will be tracked with the marker now queue_->ResetSubmissionBatch(); } else { diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index ec9843a1bf..02945b19ad 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -95,6 +95,7 @@ class Event : public RuntimeObject { std::atomic_flag notified_; //!< Command queue was notified void* hw_event_; //!< HW event ID associated with SW event Event* notify_event_; //!< Notify event, which should contain HW signal + const Device* device_; //!< Device this event is associated with protected: static const EventWaitList nullWaitList;