diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index 815aee9fd5..f29df7838a 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -64,6 +64,7 @@ hipError_t Event::query() { return ready() ? hipSuccess : hipErrorNotReady; } +// ================================================================================================ hipError_t Event::synchronize() { amd::ScopedLock lock(lock_); @@ -76,19 +77,12 @@ hipError_t Event::synchronize() { // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status static constexpr bool kWaitCompletion = true; if (!hip_device->devices()[0]->IsHwEventReady(*event_, kWaitCompletion, flags_)) { - if (event_->HwEvent() != nullptr) { - amd::Command* command = nullptr; - hipError_t status = recordCommand(command, event_->command().queue(), flags_); - command->enqueue(); - hip_device->devices()[0]->IsHwEventReady(command->event(), kWaitCompletion, flags_); - command->release(); - } else { - event_->awaitCompletion(); - } + event_->awaitCompletion(); } return hipSuccess; } +// ================================================================================================ bool Event::awaitEventCompletion() { return event_->awaitCompletion(); } @@ -222,8 +216,9 @@ hipError_t Event::streamWait(hipStream_t stream, uint flags) { return hipSuccess; } +// ================================================================================================ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, - uint32_t ext_flags ) { + uint32_t ext_flags, bool batch_flush) { if (command == nullptr) { int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) & (hipEventReleaseToDevice | hipEventReleaseToSystem | @@ -234,11 +229,12 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, releaseFlags = amd::Device::kCacheStateInvalid; } // Always submit a EventMarker. 
- command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags); + command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags, batch_flush); } return hipSuccess; } +// ================================================================================================ hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) { command->enqueue(); if (event_ == &command->event()) return hipSuccess; @@ -251,11 +247,13 @@ hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command return hipSuccess; } -hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) { +// ================================================================================================ +hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, + bool record, bool batch_flush) { hip::Stream* hip_stream = hip::getStream(stream); // Keep the lock always at the beginning of this to avoid a race. 
SWDEV-277847 amd::ScopedLock lock(lock_); - hipError_t status = recordCommand(command, hip_stream); + hipError_t status = recordCommand(command, hip_stream, 0, batch_flush); if (status != hipSuccess) { return hipSuccess; } @@ -415,7 +413,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) { if (g_devices[e->deviceId()]->devices()[0] != &hip_stream->device()) { return hipErrorInvalidHandle; } - status = e->addMarker(stream, nullptr, true); + status = e->addMarker(stream, nullptr, true, !hip::Event::kBatchFlush); } return status; } diff --git a/projects/clr/hipamd/src/hip_event.hpp b/projects/clr/hipamd/src/hip_event.hpp index 9ebb3cb3c6..2b75ab869e 100644 --- a/projects/clr/hipamd/src/hip_event.hpp +++ b/projects/clr/hipamd/src/hip_event.hpp @@ -81,10 +81,11 @@ typedef struct ihipIpcEventShmem_s { class EventMarker : public amd::Marker { public: EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false, - int32_t scope = amd::Device::kCacheStateInvalid) + int32_t scope = amd::Device::kCacheStateInvalid, bool batch_flush = true) : amd::Marker(stream, disableFlush) { profilingInfo_.enabled_ = true; profilingInfo_.marker_ts_ = markerTs; + profilingInfo_.batch_flush_ = batch_flush; profilingInfo_.clear(); setEventScope(scope); } @@ -101,6 +102,8 @@ class Event { } public: + constexpr static bool kBatchFlush = true; //!< Flushes CPU command batch in direct dispatch mode + Event(uint32_t flags) : flags_(flags), lock_(true) /* hipEvent_t lock*/, event_(nullptr), unrecorded_(false), stream_(nullptr) { // No need to init event_ here as addMarker does that @@ -123,9 +126,10 @@ class Event { virtual hipError_t streamWait(hipStream_t stream, uint flags); virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream, - uint32_t flags = 0); + uint32_t flags = 0, bool batch_flush = true); virtual hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record); - hipError_t addMarker(hipStream_t 
stream, amd::Command* command, bool record); + hipError_t addMarker(hipStream_t stream, amd::Command* command, + bool record, bool batch_flush = true); void BindCommand(amd::Command& command, bool record) { amd::ScopedLock lock(lock_); @@ -217,7 +221,8 @@ class IPCEvent : public Event { hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command); hipError_t streamWait(hipStream_t stream, uint flags); - hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags = 0); + hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, + uint32_t flags = 0, bool batch_flush = true) override; hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record); }; diff --git a/projects/clr/hipamd/src/hip_event_ipc.cpp b/projects/clr/hipamd/src/hip_event_ipc.cpp index 9c68e9f669..c7204619cd 100644 --- a/projects/clr/hipamd/src/hip_event_ipc.cpp +++ b/projects/clr/hipamd/src/hip_event_ipc.cpp @@ -139,16 +139,19 @@ hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) { return hipSuccess; } -hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags) { +// ================================================================================================ +hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, + uint32_t flags, bool batch_flush) { bool unrecorded = isUnRecorded(); if (unrecorded) { command = new amd::Marker(*stream, kMarkerDisableFlush); } else { - return Event::recordCommand(command, stream); + return Event::recordCommand(command, stream, flags, batch_flush); } return hipSuccess; } +// ================================================================================================ hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) { bool unrecorded = isUnRecorded(); if (unrecorded) { diff --git a/projects/clr/hipamd/src/hip_stream.cpp 
b/projects/clr/hipamd/src/hip_stream.cpp index 91f0472112..40ffc6ce2c 100644 --- a/projects/clr/hipamd/src/hip_stream.cpp +++ b/projects/clr/hipamd/src/hip_stream.cpp @@ -23,6 +23,7 @@ #include "hip_event.hpp" #include "thread/monitor.hpp" #include "hip_prof_api.h" +#include <atomic> namespace hip { @@ -358,11 +359,15 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) { } } bool wait = (stream == nullptr || stream == hipStreamLegacy) ? true : false; - constexpr bool kDontWaitForCpu = false; - auto hip_stream = hip::getStream(stream, wait); + bool wait_for_cpu = false; + // Force blocking wait if requested. That allows to avoid a build up of unreleased CPU commands + if (DEBUG_HIP_BLOCK_SYNC != 0) { + static std::atomic<uint64_t> flush = 0; + wait_for_cpu = ((++flush % DEBUG_HIP_BLOCK_SYNC) == 0) ? true : false; + } // Wait for the current host queue - hip_stream->finish(kDontWaitForCpu); + hip_stream->finish(wait_for_cpu); if (stream == nullptr) { // null stream will sync with other streams. ReleaseGraphExec(hip_stream->DeviceId()); diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index ef82630325..160eb84caa 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1323,9 +1323,6 @@ class VirtualDevice : public amd::HeapObject { //! Returns true if device has active wait setting bool ActiveWait() const; - //! Returns the status of queue handler callback - virtual bool isHandlerPending() const = 0; - //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; //! 
Init hidden heap for device memory allocations diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index 1d1ac66cbe..05258e961c 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -353,8 +353,6 @@ class VirtualGPU : public device::VirtualDevice { void profilerAttach(bool enable = false) {} - bool isHandlerPending() const { return false; } - bool isFenceDirty() const { return false; } void HiddenHeapInit() {} diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index d18e3c4c5c..23a2fd22d9 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -127,6 +127,9 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) { // ================================================================================================ void Timestamp::checkGpuTime() { + if (amd::IS_HIP && !amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) { + return; + } amd::ScopedLock s(lock_); if (HwProfiling()) { uint64_t start = std::numeric_limits<uint64_t>::max(); @@ -479,12 +482,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)", prof_signal->signal_.handle, prof_signal); } - SetHandlerPending(false); // Update the current command/marker with HW event prof_signal->retain(); ts->command().SetHwEvent(prof_signal); } else if (ts->command().profilingInfo().marker_ts_) { - SetHandlerPending(true); // Update the current command/marker with HW event prof_signal->retain(); ts->command().SetHwEvent(prof_signal); @@ -1652,7 +1653,7 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const { // also true for any command B, which falls between A and C. 
current = list; while (current != nullptr) { - if (current->profilingInfo().enabled_) { + if (current->profilingInfo().enabled_) { if (!current->data().empty()) { for (auto i = 0; i < current->data().size(); i++) { // Since this is a valid command to get a timestamp, we use the diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 3aeebe02b4..d015b321d2 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -248,7 +248,7 @@ class VirtualGPU : public device::VirtualDevice { class HwQueueTracker : public amd::EmbeddedObject { public: - HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu), handlerPending_(false) {} + HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {} ~HwQueueTracker(); @@ -289,12 +289,6 @@ class VirtualGPU : public device::VirtualDevice { //! Empty check for external signals bool IsExternalSignalListEmpty() const { return external_signals_.empty(); } - //! Set the status to indicate a pending handler - void SetHandlerPending(bool pending) { handlerPending_ = pending; } - - //! Check if callback has been queued - bool IsHandlerPending() const { return handlerPending_; } - //! Get/Set SDMA profiling bool GetSDMAProfiling() { return sdma_profiling_; } void SetSDMAProfiling(bool profile) { @@ -319,7 +313,6 @@ class VirtualGPU : public device::VirtualDevice { const VirtualGPU& gpu_; //!< VirtualGPU, associated with this tracker std::vector external_signals_; //!< External signals for a wait in this queue std::vector waiting_signals_; //!< Current waiting signals in this queue - bool handlerPending_; //!< This indicates if we have queued a callback handler }; VirtualGPU(Device& device, bool profiling = false, bool cooperative = false, @@ -434,10 +427,6 @@ class VirtualGPU : public device::VirtualDevice { Timestamp* timestamp() const { return timestamp_; } - //! Indicates the status of the callback handler. 
The callback would process the commands - //! and would collect profiling data, update refcounts - bool isHandlerPending() const { return barriers_.IsHandlerPending(); } - void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } void HiddenHeapInit(); diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index a4133bb957..7dd5aa27f4 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -366,7 +366,9 @@ void Command::enqueue() { ScopedLock sl(queue_->vdev()->execution()); queue_->FormSubmissionBatch(this); - if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) { + // Enqueue flushes, except profiling markers to avoid frequent expensive callbacks + if (((type() == 0) && profilingInfo().batch_flush_) || + (type() == CL_COMMAND_MARKER) || (type() == CL_COMMAND_TASK)) { // The current HSA signal tracking logic requires profiling enabled for the markers EnableProfiling(); // Update batch head for the current marker. Hence the status of all commands can be diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index 3bccbbd2ea..e825bfff98 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -114,6 +114,7 @@ class Event : public RuntimeObject { uint64_t correlation_id_; bool enabled_; //!< Profiling enabled for the wave limiter bool marker_ts_; //!< TS marker + bool batch_flush_ = true; //!< Command can flush the batch in direct dispatch mode void clear() { queued_ = 0ULL; diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp index bc38121273..33ee53c8c8 100644 --- a/projects/clr/rocclr/platform/commandqueue.cpp +++ b/projects/clr/rocclr/platform/commandqueue.cpp @@ -65,6 +65,15 @@ bool HostQueue::terminate() { // destroyed. 
Command* lastCommand = getLastQueuedCommand(true); if (lastCommand != nullptr) { + // Check if CPU batch wasn't flushed for completion with the last command + if (GetSubmittionBatch() != nullptr) { + auto command = new Marker(*this, false); + if (command != nullptr) { + ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to ensure finish"); + command->enqueue(); + lastCommand = command; + } + } lastCommand->awaitCompletion(); // Note that if lastCommand isn't a marker, it may not be lastEnqueueCommand_ now // after lastCommand->awaitCompletion() is called. @@ -128,19 +137,13 @@ void HostQueue::finish(bool cpu_wait) { if (IS_HIP) { command = getLastQueuedCommand(true); if (command == nullptr) { + assert(GetSubmittionBatch() == nullptr && + "Can't claim the queue is finished with the active batch!"); return; } } - // If command doesn't contain HW event and runtime didn't request CPU wait, - // then force marker submit - bool force_marker = false; - if (AMD_DIRECT_DISPATCH && (command != nullptr) && !cpu_wait) { - void* hw_event = - (command->NotifyEvent() != nullptr) ? 
command->NotifyEvent()->HwEvent() : command->HwEvent(); - force_marker = (hw_event == nullptr); - } - if (nullptr == command || force_marker || - vdev()->isHandlerPending() || vdev()->isFenceDirty()) { + // Force marker if the batch wasn't sent for CPU update or fence is dirty + if (nullptr == command || (GetSubmittionBatch() != nullptr) || vdev()->isFenceDirty()) { if (nullptr != command) { command->release(); } diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index c057a8cd6c..0f50070442 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -257,6 +257,8 @@ release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \ "Forces the number of streams for the graph parallel execution") \ release(bool, HIP_ALWAYS_USE_NEW_COMGR_UNBUNDLING_ACTION, false, \ "Force to always use new comgr unbundling action") \ +release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \ + "Blocks synchronization on CPU until the callback processing is done")\ release(bool, DEBUG_HIP_KERNARG_COPY_OPT, true, \ "Enable/Disable multiple kern arg copies") \ release(bool, DEBUG_CLR_USE_STDMUTEX_IN_AMD_MONITOR, false, \