From 6966d8098ecbc713165c5b849c0073a4480c3dff Mon Sep 17 00:00:00 2001
From: German Andryeyev <German.Andryeyev@amd.com>
Date: Wed, 3 Feb 2021 10:13:22 -0500
Subject: [PATCH] SWDEV-269654 - Fix HIP stream busy query

- Avoid GPU wait on the marker submission and update the command
batch after HSA signal callback upon HSA barrier completion.

Change-Id: I5c1c97212aefc2ae4b99aa9e2a81627ee9a38c1c
---
 rocclr/device/rocm/rocdevice.cpp  |   3 +
 rocclr/device/rocm/rocvirtual.cpp | 116 ++++++++++++++++++++++++------
 rocclr/device/rocm/rocvirtual.hpp |  29 +++++---
 rocclr/platform/command.cpp       |  27 +++++--
 rocclr/platform/command.hpp       |  39 +++++-----
 rocclr/platform/commandqueue.cpp  |   2 +-
 6 files changed, 158 insertions(+), 58 deletions(-)

diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 4c501cba5f..53e8958b5d 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -2361,6 +2361,7 @@ void Device::svmFree(void* ptr) const {
   }
 }
 
+// ================================================================================================
 VirtualGPU* Device::xferQueue() const {
   if (!xferQueue_) {
     // Create virtual device for internal memory transfer
@@ -2368,12 +2369,14 @@ VirtualGPU* Device::xferQueue() const {
     thisDevice->xferQueue_ = reinterpret_cast<VirtualGPU*>(thisDevice->createVirtualDevice());
     if (!xferQueue_) {
       LogError("Couldn't create the device transfer manager!");
+      return nullptr;
     }
   }
   xferQueue_->enableSyncBlit();
   return xferQueue_;
 }
 
+// ================================================================================================
 bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
   cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
   bool result = true;
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 0cd5017214..28b75c9027 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -114,7 +114,7 @@ void Timestamp::checkGpuTime() {
       }
       hsa_amd_profiling_dispatch_time_t time = {};
       if (it->engine_ == HwQueueEngine::Compute) {
-        hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
+        hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
       } else {
         hsa_amd_profiling_async_copy_time_t time_sdma = {};
         hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
@@ -132,6 +132,27 @@ void Timestamp::checkGpuTime() {
   }
 }
 
+// ================================================================================================
+bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
+  Timestamp* ts = reinterpret_cast<Timestamp*>(arg);
+  amd::Thread* thread = amd::Thread::current();
+  if (!(thread != nullptr ||
+      ((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) {
+    return false;
+  }
+  // Update the batch, since signal is complete
+  if (ts->Signals().size() > 0) {
+    amd::ScopedLock sl(ts->gpu()->execution());
+    ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n",
+        static_cast<uint32_t>(value), arg, ts->Signals()[0]->signal_.handle);
+    ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
+  } else {
+    LogError("Error: ROCr handler was called for untracked signal!");
+  }
+  // Return false, so the callback will not be called again for this signal
+  return false;
+}
+
 // ================================================================================================
 bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
   if (numMemObj > 0) {
@@ -298,19 +319,32 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
   // a GPU waiter(which may be not triggered yet) and CPU signal reset below
   WaitNext();
 
+  ProfilingSignal* prof_signal = signal_list_[current_id_];
   // Reset the signal and return
-  hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
-  signal_list_[current_id_]->done_ = false;
-  signal_list_[current_id_]->engine_ = engine_;
+  hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val);
+  prof_signal->done_ = false;
+  prof_signal->engine_ = engine_;
   if (ts != 0) {
+    // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
+    // requires the batch update upon HSA signal completion
+    if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
+      hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
+          HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, &HsaAmdSignalHandler, ts);
+      if (HSA_STATUS_SUCCESS != result) {
+        LogError("hsa_amd_signal_async_handler() failed to set the handler!");
+      } else {
+        ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(%lx), timestamp(%p)\n",
+            prof_signal->signal_.handle, prof_signal);
+      }
+    }
     if (!sdma_profiling_) {
       hsa_amd_profiling_async_copy_enable(true);
       sdma_profiling_ = true;
     }
-    signal_list_[current_id_]->ts_ = ts;
-    ts->AddProfilingSignal(signal_list_[current_id_]);
+    prof_signal->ts_ = ts;
+    ts->AddProfilingSignal(prof_signal);
   }
-  return signal_list_[current_id_]->signal_;
+  return prof_signal->signal_;
 }
 
 // ================================================================================================
@@ -905,8 +939,10 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
 VirtualGPU::~VirtualGPU() {
   delete blitMgr_;
 
-  // Release the resources of signal
-  releaseGpuMemoryFence();
+  if (tracking_created_) {
+    // Release the resources of signal
+    releaseGpuMemoryFence();
+  }
 
   destroyPool();
 
@@ -994,7 +1030,8 @@ bool VirtualGPU::create() {
   }
 
   // Allocate signal tracker for ROCr copy queue
-  if (!Barriers().Create()) {
+  tracking_created_ = Barriers().Create();
+  if (!tracking_created_) {
     LogError("Could not create signal for copy queue!");
     return false;
   }
@@ -1057,7 +1094,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
       return;
     }
     // Without barrier profiling will wait for each individual signal
-    timestamp_ = new Timestamp(dev().getBackendDevice());
+    timestamp_ = new Timestamp(this, command);
     timestamp_->start();
   }
 }
@@ -1073,7 +1110,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
     if (!timestamp_->HwProfiling()) {
       timestamp_->end();
     }
-    command.setData(reinterpret_cast<void*>(timestamp_));
+    command.setData(timestamp_);
     timestamp_ = nullptr;
   }
 }
@@ -1378,6 +1415,8 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
 
 // ================================================================================================
 void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
+  // Make sure VirtualGPU has an exclusive access to the resources
+  amd::ScopedLock lock(execution());
 #if AMD_HMM_SUPPORT
   profilingBegin(cmd);
   // Initialize signal for the barrier
@@ -2582,6 +2621,11 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
 
     // Get device queue for exclusive GPU access
     VirtualGPU* queue = dev().xferQueue();
+    if (!queue) {
+      LogError("Runtime failed to acquire a cooperative queue!");
+      vcmd.setStatus(CL_INVALID_OPERATION);
+      return;
+    }
 
     // Lock the queue, using the blit manager lock
     amd::ScopedLock lock(queue->blitMgr().lockXfer());
@@ -2680,20 +2724,46 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
 
 // ================================================================================================
 void VirtualGPU::flush(amd::Command* list, bool wait) {
-  // If barrier is requested, then wait for everything, otherwise
-  // a per disaptch wait will occur later in updateCommandsState()
-  if (dev().settings().barrier_sync_) {
-    releaseGpuMemoryFence();
-  }
-  updateCommandsState(list);
+  // Direct dispatch relies on HSA signal callback
+  bool skip_cpu_wait = AMD_DIRECT_DISPATCH;
 
-  // Add extra clean up for resources if releaseGpuMemoryFence() was skipped
-  if (!dev().settings().barrier_sync_) {
-    ResetQueueStates();
+  if (skip_cpu_wait) {
+    // Search for the last command in the batch to track GPU state
+    amd::Command* current = list;
+    while (current->getNext() != nullptr) {
+      current = current->getNext();
+    }
+    // Enable profiling, so runtime can track TS
+    profilingBegin(*current);
+
+    // If runtime didn't submit a barrier, then it can't track the completion of the batch.
+    // Hence runtime either has to insert a barrier unconditionally or have a CPU wait.
+    // Due to performance impact of extra barriers CPU wait is selected.
+    // CPU wait is also forced if pinned buffers were used
+    skip_cpu_wait &= hasPendingDispatch_ && (pinnedMems_.size() == 0);
+
+    releaseGpuMemoryFence(kIgnoreBarrier, skip_cpu_wait);
+    profilingEnd(*current);
+  } else {
+    // If barrier is requested, then wait for everything, otherwise
+    // a per disaptch wait will occur later in updateCommandsState()
+    if (dev().settings().barrier_sync_) {
+      releaseGpuMemoryFence();
+    }
   }
 
-  // Release all pinned memory
-  releasePinnedMem();
+  // If CPU waited for GPU, then the queue is idle
+  if (!skip_cpu_wait) {
+      updateCommandsState(list);
+
+    // Add extra clean up for resources if releaseGpuMemoryFence() was skipped
+    if (!dev().settings().barrier_sync_) {
+      ResetQueueStates();
+    }
+
+    // Release all pinned memory
+    releasePinnedMem();
+  }
 }
 
 // ================================================================================================
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 029fbb6240..cc4c96c635 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -70,16 +70,25 @@ inline bool WaitForSignal(hsa_signal_t signal) {
 
 // Timestamp for keeping track of some profiling information for various commands
 // including EnqueueNDRangeKernel and clEnqueueCopyBuffer.
-class Timestamp {
+class Timestamp : public amd::HeapObject {
  private:
   static double ticksToTime_;
 
   uint64_t    start_;
   uint64_t    end_;
-  hsa_agent_t agent_;
+  VirtualGPU* gpu_;               //!< Virtual GPU, associated with this timestamp
+  const amd::Command& command_;   //!< Command, associated with this timestamp
   std::vector<ProfilingSignal*> signals_;
 
  public:
+  Timestamp(VirtualGPU* gpu, const amd::Command& command)
+    : start_(std::numeric_limits<uint64_t>::max())
+    , end_(0)
+    , gpu_(gpu)
+    , command_(command) {}
+
+  virtual ~Timestamp() {}
+
   uint64_t getStart() {
     checkGpuTime();
     return start_;
@@ -92,15 +101,10 @@ class Timestamp {
 
   void AddProfilingSignal(ProfilingSignal* signal) { signals_.push_back(signal); }
 
+  const std::vector<ProfilingSignal*>& Signals() const { return signals_; }
+
   const bool HwProfiling() const { return !signals_.empty(); }
 
-  Timestamp(hsa_agent_t agent)
-    : start_(std::numeric_limits<uint64_t>::max())
-    , end_(0)
-    , agent_(agent) {}
-
-  ~Timestamp() {}
-
   //! Finds execution ticks on GPU
   void checkGpuTime();
 
@@ -112,6 +116,12 @@ class Timestamp {
 
   static void setGpuTicksToTime(double ticksToTime) { ticksToTime_ = ticksToTime; }
   static double getGpuTicksToTime() { return ticksToTime_; }
+
+  //! Returns amd::command assigned to this timestamp
+  const amd::Command& command() const { return command_; }
+
+  //! Returns virtual GPU device, used with this timestamp
+  VirtualGPU* gpu() const { return gpu_; }
 };
 
 class VirtualGPU : public device::VirtualDevice {
@@ -382,6 +392,7 @@ class VirtualGPU : public device::VirtualDevice {
       uint32_t addSystemScope_     : 1; //!< Insert a system scope to the next aql
       uint32_t isLastCommandSDMA_  : 1; //!< Keep track if the last command was SDMA and
                                         //!< not send Barrier packets if barrier_sync is 0
+      uint32_t tracking_created_   : 1; //!< Enabled if tracking object was properly initialized
     };
     uint32_t  state_;
   };
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index 86444ed717..8bd79a8cc1 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -243,14 +243,14 @@ Command* Event::notifyCmdQueue(bool retain) {
 
 const Event::EventWaitList Event::nullWaitList(0);
 
+// ================================================================================================
 Command::Command(HostQueue& queue, cl_command_type type,
                  const EventWaitList& eventWaitList, uint32_t commandWaitBits)
     : Event(queue),
       queue_(&queue),
-      next_(NULL),
+      next_(nullptr),
       type_(type),
-      exception_(0),
-      data_(NULL),
+      data_(nullptr),
       eventWaitList_(eventWaitList),
       commandWaitBits_(commandWaitBits) {
   // Retain the commands from the event wait list.
@@ -258,6 +258,7 @@ Command::Command(HostQueue& queue, cl_command_type type,
   if (type != 0) activity_.Initialize(type, queue.vdev()->index(), queue.device().index());
 }
 
+// ================================================================================================
 void Command::releaseResources() {
   const Command::EventWaitList& events = eventWaitList();
 
@@ -265,6 +266,7 @@ void Command::releaseResources() {
   std::for_each(events.begin(), events.end(), std::mem_fun(&Command::release));
 }
 
+// ================================================================================================
 void Command::enqueue() {
   assert(queue_ != NULL && "Cannot be enqueued");
 
@@ -277,16 +279,28 @@ void Command::enqueue() {
   // Direct dispatch logic below will submit the command immediately, but the command status
   // update will occur later after flush() with a wait
   if (AMD_DIRECT_DISPATCH) {
+    setStatus(CL_QUEUED);
+    // The wait should be performed before the lock,
+    // otherwise signal handler may have a deadlock, but awaitCompletion() is thread safe itself
+    std::for_each(eventWaitList().begin(), eventWaitList().end(),
+        std::mem_fun(&Command::awaitCompletion));
+
     // The batch update must be lock protected to avoid a race condition
     // when multiple threads submit/flush/update the batch at the same time
-    ScopedLock sl(queue_->lock());
+    ScopedLock sl(queue_->vdev()->execution());
     queue_->FormSubmissionBatch(this);
     if ((type() == CL_COMMAND_MARKER || type() == 0) && !profilingInfo().marker_ts_) {
-      // Flush the current batch and wait for the results, since it's a marker.
-      // @todo: The condition requires a reevaluation to determine if any marker needs a wait.
+      // The current HSA signal tracking logic requires profiling enabled for the markers
+      EnableProfiling();
+      // Update batch head for the current marker. Hence the status of all commands can be
+      // updated upon the marker completion
+      SetBatchHead(queue_->GetSubmittionBatch());
+      // Flush the current batch, but skip the wait on CPU if possible to avoid a stall
       queue_->vdev()->flush(queue_->GetSubmittionBatch());
+      // The batch will be tracked with the marker now
       queue_->ResetSubmissionBatch();
     } else {
+      setStatus(CL_SUBMITTED);
       submit(*queue_->vdev());
     }
   } else {
@@ -299,6 +313,7 @@ void Command::enqueue() {
   }
 }
 
+// ================================================================================================
 const Context& Command::context() const { return queue_->context(); }
 
 NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index 23d23804c7..8e6a78ce96 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -127,7 +127,6 @@ class Event : public RuntimeObject {
       clear();
       callback_ = callback;
     }
-
   } profilingInfo_;
 
   activity_prof::ActivityProf activity_;  //!< Activity profiling
@@ -154,6 +153,13 @@ class Event : public RuntimeObject {
 
   void waitForCompletion();
 
+  //! Enable profiling for this command
+  void EnableProfiling() {
+    profilingInfo_.enabled_ = true;
+    profilingInfo_.clear();
+    profilingInfo_.callback_ = nullptr;
+  }
+
  public:
   //! Return the context for this event.
   virtual const Context& context() const = 0;
@@ -207,17 +213,13 @@ class Event : public RuntimeObject {
  *  submitted to a HostQueue for execution. Classes derived from
  *  %Command must implement the submit() function.
  *
-
  */
 class Command : public Event {
  private:
-  //! The command queue this command is enqueue into. NULL if not yet enqueue.
-  HostQueue* queue_;
-  //! Next GPU command in the queue list
-  Command* next_;
-
-  cl_command_type type_;        //!< This command's OpenCL type.
-  volatile int32_t exception_;  //!< The first raised exception.
+  HostQueue* queue_;              //!< The command queue this command is enqueue into
+  Command* next_;                 //!< Next GPU command in the queue list
+  Command* batch_head_ = nullptr; //!< The head of the batch commands
+  cl_command_type     type_;      //!< This command's OpenCL type.
   void* data_;
 
  protected:
@@ -235,11 +237,10 @@ class Command : public Event {
   //! Construct a new command of the given OpenCL type.
   Command(cl_command_type type)
       : Event(),
-        queue_(NULL),
-        next_(NULL),
+        queue_(nullptr),
+        next_(nullptr),
         type_(type),
-        exception_(0),
-        data_(NULL),
+        data_(nullptr),
         eventWaitList_(nullWaitList),
         commandWaitBits_(0) {}
 
@@ -267,12 +268,6 @@ class Command : public Event {
   //! Return this command's OpenCL type.
   cl_command_type type() const { return type_; }
 
-  //! Return the first raised exception or 0 if none.
-  int32_t exception() const { return exception_; }
-
-  //! Set the exception for this command.
-  void setException(int32_t exception) { exception_ = exception; }
-
   //! Return the opaque, device specific data for this command.
   void* data() const { return data_; }
 
@@ -303,6 +298,12 @@ class Command : public Event {
   uint32_t getWaitBits() const { return commandWaitBits_; }
 
   void OverrrideCommandType(cl_command_type type) { type_ = type; }
+
+  //! Updates the batch head, associated with this command(marker)
+  void SetBatchHead(Command* command) { batch_head_ = command; }
+
+  //! Returns the current batch head
+  Command* GetBatchHead() const { return batch_head_; }
 };
 
 class UserEvent : public Command {
diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp
index 4af38dd53b..2031ad44e8 100644
--- a/rocclr/platform/commandqueue.cpp
+++ b/rocclr/platform/commandqueue.cpp
@@ -227,7 +227,7 @@ Command* HostQueue::getLastQueuedCommand(bool retain) {
   if (AMD_DIRECT_DISPATCH) {
     // The batch update must be lock protected to avoid a race condition
     // when multiple threads submit/flush/update the batch at the same time
-    ScopedLock sl(lock());
+    ScopedLock sl(vdev()->execution());
 
     // Since the lastCmdLock_ is acquired, it is safe to read and retain the lastEnqueueCommand.
     // It is guaranteed that the pointer will not change.