From 6966d8098ecbc713165c5b849c0073a4480c3dff Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Wed, 3 Feb 2021 10:13:22 -0500 Subject: [PATCH] SWDEV-269654 - Fix HIP stream busy query - Avoid GPU wait on the marker submission and update the command batch after HSA signal callback upon HSA barrier completion. Change-Id: I5c1c97212aefc2ae4b99aa9e2a81627ee9a38c1c --- rocclr/device/rocm/rocdevice.cpp | 3 + rocclr/device/rocm/rocvirtual.cpp | 116 ++++++++++++++++++++++++------ rocclr/device/rocm/rocvirtual.hpp | 29 +++++--- rocclr/platform/command.cpp | 27 +++++-- rocclr/platform/command.hpp | 39 +++++----- rocclr/platform/commandqueue.cpp | 2 +- 6 files changed, 158 insertions(+), 58 deletions(-) diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 4c501cba5f..53e8958b5d 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -2361,6 +2361,7 @@ void Device::svmFree(void* ptr) const { } } +// ================================================================================================ VirtualGPU* Device::xferQueue() const { if (!xferQueue_) { // Create virtual device for internal memory transfer @@ -2368,12 +2369,14 @@ VirtualGPU* Device::xferQueue() const { thisDevice->xferQueue_ = reinterpret_cast(thisDevice->createVirtualDevice()); if (!xferQueue_) { LogError("Couldn't create the device transfer manager!"); + return nullptr; } } xferQueue_->enableSyncBlit(); return xferQueue_; } +// ================================================================================================ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { bool result = true; diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 0cd5017214..28b75c9027 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -114,7 +114,7 @@ void Timestamp::checkGpuTime() { } hsa_amd_profiling_dispatch_time_t time = {}; if (it->engine_ == HwQueueEngine::Compute) { - hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time); + hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time); } else { hsa_amd_profiling_async_copy_time_t time_sdma = {}; hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma); @@ -132,6 +132,27 @@ void Timestamp::checkGpuTime() { } } +// ================================================================================================ +bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) { + Timestamp* ts = reinterpret_cast(arg); + amd::Thread* thread = amd::Thread::current(); + if (!(thread != nullptr || + ((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) { + return false; + } + // Update the batch, since signal is complete + if (ts->Signals().size() > 0) { + amd::ScopedLock sl(ts->gpu()->execution()); + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n", + static_cast(value), arg, ts->Signals()[0]->signal_.handle); + ts->gpu()->updateCommandsState(ts->command().GetBatchHead()); + } else { + LogError("Error: ROCr handler was called for untracked signal!"); + } + // Return false, so the callback will not be called again for this signal + return false; +} + // ================================================================================================ bool VirtualGPU::MemoryDependency::create(size_t numMemObj) { if (numMemObj > 0) { @@ -298,19 +319,32 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( // a GPU waiter(which may be not triggered yet) and CPU signal reset below WaitNext(); + ProfilingSignal* prof_signal = signal_list_[current_id_]; // Reset the signal and return - hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val); - signal_list_[current_id_]->done_ = false; - signal_list_[current_id_]->engine_ = engine_; + hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val); + prof_signal->done_ = false; + prof_signal->engine_ = engine_; if (ts != 0) { + // If direct dispatch is enabled and the batch head isn't null, then it's a marker and + // requires the batch update upon HSA signal completion + if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) { + hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_, + HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, &HsaAmdSignalHandler, ts); + if (HSA_STATUS_SUCCESS != result) { + LogError("hsa_amd_signal_async_handler() failed to set the handler!"); + } else { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(%lx), timestamp(%p)\n", + prof_signal->signal_.handle, prof_signal); + } + } if (!sdma_profiling_) { hsa_amd_profiling_async_copy_enable(true); sdma_profiling_ = true; } - signal_list_[current_id_]->ts_ = ts; - ts->AddProfilingSignal(signal_list_[current_id_]); + prof_signal->ts_ = ts; + ts->AddProfilingSignal(prof_signal); } - return signal_list_[current_id_]->signal_; + return prof_signal->signal_; } // ================================================================================================ @@ -905,8 +939,10 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, VirtualGPU::~VirtualGPU() { delete blitMgr_; - // Release the resources of signal - releaseGpuMemoryFence(); + if (tracking_created_) { + // Release the resources of signal + releaseGpuMemoryFence(); + } destroyPool(); @@ -994,7 +1030,8 @@ bool VirtualGPU::create() { } // Allocate signal tracker for ROCr copy queue - if (!Barriers().Create()) { + tracking_created_ = Barriers().Create(); + if (!tracking_created_) { LogError("Could not create signal for copy queue!"); return false; } @@ -1057,7 +1094,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { return; } // Without barrier profiling will wait for each individual signal - timestamp_ = new Timestamp(dev().getBackendDevice()); + timestamp_ = new Timestamp(this, command); timestamp_->start(); } } @@ -1073,7 +1110,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) { if (!timestamp_->HwProfiling()) { timestamp_->end(); } - command.setData(reinterpret_cast(timestamp_)); + command.setData(timestamp_); timestamp_ = nullptr; } } @@ -1378,6 +1415,8 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) { // ================================================================================================ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); #if AMD_HMM_SUPPORT profilingBegin(cmd); // Initialize signal for the barrier @@ -2582,6 +2621,11 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { // Get device queue for exclusive GPU access VirtualGPU* queue = dev().xferQueue(); + if (!queue) { + LogError("Runtime failed to acquire a cooperative queue!"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } // Lock the queue, using the blit manager lock amd::ScopedLock lock(queue->blitMgr().lockXfer()); @@ -2680,20 +2724,46 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) { // ================================================================================================ void VirtualGPU::flush(amd::Command* list, bool wait) { - // If barrier is requested, then wait for everything, otherwise - // a per disaptch wait will occur later in updateCommandsState() - if (dev().settings().barrier_sync_) { - releaseGpuMemoryFence(); - } - updateCommandsState(list); + // Direct dispatch relies on HSA signal callback + bool skip_cpu_wait = AMD_DIRECT_DISPATCH; - // Add extra clean up for resources if releaseGpuMemoryFence() was skipped - if (!dev().settings().barrier_sync_) { - ResetQueueStates(); + if (skip_cpu_wait) { + // Search for the last command in the batch to track GPU state + amd::Command* current = list; + while (current->getNext() != nullptr) { + current = current->getNext(); + } + // Enable profiling, so runtime can track TS + profilingBegin(*current); + + // If runtime didn't submit a barrier, then it can't track the completion of the batch. + // Hence runtime either has to insert a barrier unconditionally or have a CPU wait. + // Due to performance impact of extra barriers CPU wait is selected. + // CPU wait is also forced if pinned buffers were used + skip_cpu_wait &= hasPendingDispatch_ && (pinnedMems_.size() == 0); + + releaseGpuMemoryFence(kIgnoreBarrier, skip_cpu_wait); + profilingEnd(*current); + } else { + // If barrier is requested, then wait for everything, otherwise + // a per disaptch wait will occur later in updateCommandsState() + if (dev().settings().barrier_sync_) { + releaseGpuMemoryFence(); + } } - // Release all pinned memory - releasePinnedMem(); + // If CPU waited for GPU, then the queue is idle + if (!skip_cpu_wait) { + updateCommandsState(list); + + // Add extra clean up for resources if releaseGpuMemoryFence() was skipped + if (!dev().settings().barrier_sync_) { + ResetQueueStates(); + } + + // Release all pinned memory + releasePinnedMem(); + } } // ================================================================================================ diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 029fbb6240..cc4c96c635 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -70,16 +70,25 @@ inline bool WaitForSignal(hsa_signal_t signal) { // Timestamp for keeping track of some profiling information for various commands // including EnqueueNDRangeKernel and clEnqueueCopyBuffer. -class Timestamp { +class Timestamp : public amd::HeapObject { private: static double ticksToTime_; uint64_t start_; uint64_t end_; - hsa_agent_t agent_; + VirtualGPU* gpu_; //!< Virtual GPU, associated with this timestamp + const amd::Command& command_; //!< Command, associated with this timestamp std::vector signals_; public: + Timestamp(VirtualGPU* gpu, const amd::Command& command) + : start_(std::numeric_limits::max()) + , end_(0) + , gpu_(gpu) + , command_(command) {} + + virtual ~Timestamp() {} + uint64_t getStart() { checkGpuTime(); return start_; @@ -92,15 +101,10 @@ class Timestamp { void AddProfilingSignal(ProfilingSignal* signal) { signals_.push_back(signal); } + const std::vector& Signals() const { return signals_; } + const bool HwProfiling() const { return !signals_.empty(); } - Timestamp(hsa_agent_t agent) - : start_(std::numeric_limits::max()) - , end_(0) - , agent_(agent) {} - - ~Timestamp() {} - //! Finds execution ticks on GPU void checkGpuTime(); @@ -112,6 +116,12 @@ class Timestamp { static void setGpuTicksToTime(double ticksToTime) { ticksToTime_ = ticksToTime; } static double getGpuTicksToTime() { return ticksToTime_; } + + //! Returns amd::command assigned to this timestamp + const amd::Command& command() const { return command_; } + + //! Returns virtual GPU device, used with this timestamp + VirtualGPU* gpu() const { return gpu_; } }; class VirtualGPU : public device::VirtualDevice { @@ -382,6 +392,7 @@ class VirtualGPU : public device::VirtualDevice { uint32_t addSystemScope_ : 1; //!< Insert a system scope to the next aql uint32_t isLastCommandSDMA_ : 1; //!< Keep track if the last command was SDMA and //!< not send Barrier packets if barrier_sync is 0 + uint32_t tracking_created_ : 1; //!< Enabled if tracking object was properly initialized }; uint32_t state_; }; diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index 86444ed717..8bd79a8cc1 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -243,14 +243,14 @@ Command* Event::notifyCmdQueue(bool retain) { const Event::EventWaitList Event::nullWaitList(0); +// ================================================================================================ Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList, uint32_t commandWaitBits) : Event(queue), queue_(&queue), - next_(NULL), + next_(nullptr), type_(type), - exception_(0), - data_(NULL), + data_(nullptr), eventWaitList_(eventWaitList), commandWaitBits_(commandWaitBits) { // Retain the commands from the event wait list. @@ -258,6 +258,7 @@ Command::Command(HostQueue& queue, cl_command_type type, if (type != 0) activity_.Initialize(type, queue.vdev()->index(), queue.device().index()); } +// ================================================================================================ void Command::releaseResources() { const Command::EventWaitList& events = eventWaitList(); @@ -265,6 +266,7 @@ void Command::releaseResources() { std::for_each(events.begin(), events.end(), std::mem_fun(&Command::release)); } +// ================================================================================================ void Command::enqueue() { assert(queue_ != NULL && "Cannot be enqueued"); @@ -277,16 +279,28 @@ void Command::enqueue() { // Direct dispatch logic below will submit the command immediately, but the command status // update will occur later after flush() with a wait if (AMD_DIRECT_DISPATCH) { + setStatus(CL_QUEUED); + // The wait should be performed before the lock, + // otherwise signal handler may have a deadlock, but awaitCompletion() is thread safe itself + std::for_each(eventWaitList().begin(), eventWaitList().end(), + std::mem_fun(&Command::awaitCompletion)); + // The batch update must be lock protected to avoid a race condition // when multiple threads submit/flush/update the batch at the same time - ScopedLock sl(queue_->lock()); + ScopedLock sl(queue_->vdev()->execution()); queue_->FormSubmissionBatch(this); if ((type() == CL_COMMAND_MARKER || type() == 0) && !profilingInfo().marker_ts_) { - // Flush the current batch and wait for the results, since it's a marker. - // @todo: The condition requires a reevaluation to determine if any marker needs a wait. + // The current HSA signal tracking logic requires profiling enabled for the markers + EnableProfiling(); + // Update batch head for the current marker. Hence the status of all commands can be + // updated upon the marker completion + SetBatchHead(queue_->GetSubmittionBatch()); + // Flush the current batch, but skip the wait on CPU if possible to avoid a stall queue_->vdev()->flush(queue_->GetSubmittionBatch()); + // The batch will be tracked with the marker now queue_->ResetSubmissionBatch(); } else { + setStatus(CL_SUBMITTED); submit(*queue_->vdev()); } } else { @@ -299,6 +313,7 @@ void Command::enqueue() { } } +// ================================================================================================ const Context& Command::context() const { return queue_->context(); } NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index 23d23804c7..8e6a78ce96 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -127,7 +127,6 @@ class Event : public RuntimeObject { clear(); callback_ = callback; } - } profilingInfo_; activity_prof::ActivityProf activity_; //!< Activity profiling @@ -154,6 +153,13 @@ class Event : public RuntimeObject { void waitForCompletion(); + //! Enable profiling for this command + void EnableProfiling() { + profilingInfo_.enabled_ = true; + profilingInfo_.clear(); + profilingInfo_.callback_ = nullptr; + } + public: //! Return the context for this event. virtual const Context& context() const = 0; @@ -207,17 +213,13 @@ class Event : public RuntimeObject { * submitted to a HostQueue for execution. Classes derived from * %Command must implement the submit() function. * - */ class Command : public Event { private: - //! The command queue this command is enqueue into. NULL if not yet enqueue. - HostQueue* queue_; - //! Next GPU command in the queue list - Command* next_; - - cl_command_type type_; //!< This command's OpenCL type. - volatile int32_t exception_; //!< The first raised exception. + HostQueue* queue_; //!< The command queue this command is enqueue into + Command* next_; //!< Next GPU command in the queue list + Command* batch_head_ = nullptr; //!< The head of the batch commands + cl_command_type type_; //!< This command's OpenCL type. void* data_; protected: @@ -235,11 +237,10 @@ class Command : public Event { //! Construct a new command of the given OpenCL type. Command(cl_command_type type) : Event(), - queue_(NULL), - next_(NULL), + queue_(nullptr), + next_(nullptr), type_(type), - exception_(0), - data_(NULL), + data_(nullptr), eventWaitList_(nullWaitList), commandWaitBits_(0) {} @@ -267,12 +268,6 @@ class Command : public Event { //! Return this command's OpenCL type. cl_command_type type() const { return type_; } - //! Return the first raised exception or 0 if none. - int32_t exception() const { return exception_; } - - //! Set the exception for this command. - void setException(int32_t exception) { exception_ = exception; } - //! Return the opaque, device specific data for this command. void* data() const { return data_; } @@ -303,6 +298,12 @@ class Command : public Event { uint32_t getWaitBits() const { return commandWaitBits_; } void OverrrideCommandType(cl_command_type type) { type_ = type; } + + //! Updates the batch head, associated with this command(marker) + void SetBatchHead(Command* command) { batch_head_ = command; } + + //! Returns the current batch head + Command* GetBatchHead() const { return batch_head_; } }; class UserEvent : public Command { diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index 4af38dd53b..2031ad44e8 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -227,7 +227,7 @@ Command* HostQueue::getLastQueuedCommand(bool retain) { if (AMD_DIRECT_DISPATCH) { // The batch update must be lock protected to avoid a race condition // when multiple threads submit/flush/update the batch at the same time - ScopedLock sl(lock()); + ScopedLock sl(vdev()->execution()); // Since the lastCmdLock_ is acquired, it is safe to read and retain the lastEnqueueCommand. // It is guaranteed that the pointer will not change.