SWDEV-272496 - Fix multiple timing issues

- Don't notify if the batch is empty, because that means the current command was processed already. - Disable pinning optimization to avoid a race condition on stall. - TS marker submition requires extra AQL barrier to track the status. Change-Id: I17eff4ad12ac66cfe1bb44048bebb1891805279d [ROCm/clr commit: 24299e25bd]
2021-02-26 16:17:30 -05:00
parent d82e14bf2c
commit f6cc68deac
@@ -750,7 +750,7 @@ bool Buffer::create() {
                    "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed");
            return false;
          }
-          volatile hsa_signal_value_t* signalValuePtr;
+          volatile hsa_signal_value_t* signalValuePtr = nullptr;
          if (HSA_STATUS_SUCCESS != hsa_amd_signal_value_pointer(signal_, &signalValuePtr)) {
            ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
                    "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY pointer query failed");
@@ -146,15 +146,13 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
      ((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) {
    return false;
  }
+  amd::ScopedLock sl(ts->gpu()->execution());
+  ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n",
+    static_cast<uint32_t>(value), arg, ts->HwProfiling() ? ts->Signals()[0]->signal_.handle : 0);
+
  // Update the batch, since signal is complete
-  if (ts->Signals().size() > 0) {
-    amd::ScopedLock sl(ts->gpu()->execution());
-    ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n",
-        static_cast<uint32_t>(value), arg, ts->Signals()[0]->signal_.handle);
-    ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
-  } else {
-    LogError("Error: ROCr handler was called for untracked signal!");
-  }
+  ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
+
  // Return false, so the callback will not be called again for this signal
  return false;
 }
@@ -331,6 +329,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
  prof_signal->done_ = false;
  prof_signal->engine_ = engine_;
  if (ts != 0) {
+    // Save HSA signal earlier to make sure the possible callback will have a valid
+    // value for processing
+    prof_signal->ts_ = ts;
+    ts->AddProfilingSignal(prof_signal);
    // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
    // requires the batch update upon HSA signal completion
    if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
@@ -347,8 +349,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
      hsa_amd_profiling_async_copy_enable(true);
      sdma_profiling_ = true;
    }
-    prof_signal->ts_ = ts;
-    ts->AddProfilingSignal(prof_signal);
  }
  return prof_signal->signal_;
 }
@@ -1127,7 +1127,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
 }

 // ================================================================================================
-void VirtualGPU::updateCommandsState(amd::Command* list) {
+void VirtualGPU::updateCommandsState(amd::Command* list) const {
  Timestamp* ts = nullptr;

  amd::Command* current = list;
@@ -2802,6 +2802,7 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
  // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
 }

+// ================================================================================================
 void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  if (vcmd.profilingInfo().marker_ts_) {
    profilingBegin(vcmd);
@@ -2811,11 +2812,16 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
      // for cache flushes explicitly and helps wall time
      uint16_t header = kNopPacketHeader;
      dispatchBarrierPacket(&barrier_packet_, header);
+      // Direct dispatch requires a barrier with callback and hasPendingDispatch_ triggers that
+      if (AMD_DIRECT_DISPATCH) {
+        hasPendingDispatch_ = true;
+      }
    }
    profilingEnd(vcmd);
  }
 }

+// ================================================================================================
 void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
@@ -2856,8 +2862,9 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
    // If runtime didn't submit a barrier, then it can't track the completion of the batch.
    // Hence runtime either has to insert a barrier unconditionally or have a CPU wait.
    // Due to performance impact of extra barriers CPU wait is selected.
-    // CPU wait is also forced if pinned buffers were used
-    skip_cpu_wait &= hasPendingDispatch_ && (pinnedMems_.size() == 0);
+    // Note: if callback will be selected to update the batch status,
+    // then the host thread can't update it also, otherwise double free may occur
+    skip_cpu_wait &= hasPendingDispatch_;

    releaseGpuMemoryFence(force_barrier, skip_cpu_wait);
    profilingEnd(*current);
@@ -2910,14 +2917,18 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
  //!        unconditionally, before it can release pinned memory
  releaseGpuMemoryFence();
-  if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
-    if (pinnedMems_.size() > 7) {
-      pinnedMems_.front()->release();
-      pinnedMems_.erase(pinnedMems_.begin());
-    }
+  if (!AMD_DIRECT_DISPATCH) {
+    if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
+      if (pinnedMems_.size() > 7) {
+        pinnedMems_.front()->release();
+        pinnedMems_.erase(pinnedMems_.begin());
+      }

-    // Delay destruction
-    pinnedMems_.push_back(mem);
+      // Delay destruction
+      pinnedMems_.push_back(mem);
+    }
+  } else {
+    mem->release();
  }
 }

@@ -36,7 +36,6 @@ class Device;
 class Memory;
 class Timestamp;

-
 struct ProfilingSignal : public amd::HeapObject {
  hsa_signal_t  signal_;  //!< HSA signal to track profiling information
  Timestamp*    ts_;      //!< Timestamp object associated with the signal
@@ -98,7 +97,7 @@ class Timestamp : public amd::HeapObject {
    , gpu_(gpu)
    , command_(command) {}

-  virtual ~Timestamp() {}
+  ~Timestamp() {}

  uint64_t getStart() {
    checkGpuTime();
@@ -236,7 +235,7 @@ class VirtualGPU : public device::VirtualDevice {
  void profilingBegin(amd::Command& command, bool drmProfiling = false);
  void profilingEnd(amd::Command& command);

-  void updateCommandsState(amd::Command* list);
+  void updateCommandsState(amd::Command* list) const;

  void submitReadMemory(amd::ReadMemoryCommand& cmd);
  void submitWriteMemory(amd::WriteMemoryCommand& cmd);
@@ -213,13 +213,17 @@ bool Event::awaitCompletion() {
 // ================================================================================================
 bool Event::notifyCmdQueue() {
  HostQueue* queue = command().queue();
-  if ((status() > CL_COMPLETE) &&
-      // Don't need to notify any marker with direct dispatch,
-      // because all markers are blocking
+  if ((status() > CL_COMPLETE) && (nullptr != queue) &&
      (!AMD_DIRECT_DISPATCH ||
+       // Don't need to notify any marker with direct dispatch,
+       // because all markers are blocking.
       ((command().type() != CL_COMMAND_MARKER) &&
-        (command().type() != 0))) &&
-      (nullptr != queue) && !notified_.test_and_set()) {
+        (command().type() != 0)) ||
+        // Don't need to notify if the current batch is empty,
+        // because that means the command was processed and extra notification
+        // will cause a stall on the host.
+        (queue->GetSubmittionBatch() != nullptr)) &&
+        !notified_.test_and_set()) {
    // Make sure the queue is draining the enqueued commands.
    amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
    if (command == NULL) {