SWDEV-301667 - Add cache state for a device

- Add a global cache state for a device to indicate the scopes of submitted AQL packets - Remove scopes for the TS marker if hipEventReleaseToDevice is passed. Set env ROC_EVENT_NO_FLUSH=1 to use a NOP AQL packet for event records; by default, event records flush caches with a system-scope release. - Calling finish() should check whether caches are flushed and, if not, queue a marker Change-Id: Ibbbdbb1cd7ac61cb35649169212142545be159e0 [ROCm/clr commit: 8eeaa998c0]
2022-03-29 22:48:19 -07:00
@@ -1527,6 +1527,13 @@ class Device : public RuntimeObject {
    kKernArg = 2
  } MemorySegment;

+  typedef enum CacheState {  //!< Cache state tracked per device; presumably mirrors HSA release fence scopes - TODO confirm
+    kCacheStateInvalid = -1,  //!< Initial value assigned by the ROCm device constructor
+    kCacheStateIgnore = 0,    //!< Presumably no release scope (NOP packet) - TODO confirm
+    kCacheStateAgent = 1,     //!< Presumably agent (device) scope release - TODO confirm
+    kCacheStateSystem = 2     //!< State HostQueue::finish() checks for before skipping its marker
+  } CacheState;
+
  typedef std::pair<LinkAttribute, int32_t /* value */> LinkAttrType;

  static constexpr size_t kP2PStagingSize = 4 * Mi;
@@ -1749,6 +1756,11 @@ class Device : public RuntimeObject {
    return false;
  };

+  // Returns true if the device cache state equals the parameter; this default always returns false
+  virtual bool IsCacheFlushed(CacheState state) const {
+    return false;
+  };
+
  virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};

  virtual const uint32_t getPreferredNumaNode() const { return 0; }
@@ -180,6 +180,7 @@ Device::Device(hsa_agent_t bkendDevice)
  gpuvm_segment_.handle = 0;
  gpu_fine_grained_segment_.handle = 0;
  prefetch_signal_.handle = 0;
+  cache_state_ = Device::CacheState::kCacheStateInvalid;
 }

 void Device::setupCpuAgent() {
@@ -2635,6 +2636,17 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t*
                    start, end);
  }
 }
+
+// ================================================================================================
+bool Device::IsCacheFlushed(Device::CacheState state) const {  // True when 'state' matches the current cache_state_ value
+  return (static_cast<int>(state) == cache_state_.load(std::memory_order_relaxed));
+}
+
+// ================================================================================================
+void Device::SetCacheState(Device::CacheState state) {  // Overwrites cache_state_ with 'state' (stored as int)
+  cache_state_.store(static_cast<int>(state), std::memory_order_relaxed);
+}
+
 // ================================================================================================
 static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
  if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
@@ -260,6 +260,8 @@ class NullDevice : public amd::Device {

  virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
  virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
+  virtual bool IsCacheFlushed(Device::CacheState state) const { return false; };  // Stub: always reports false
+  virtual void SetCacheState(Device::CacheState state) {};  // Stub: records nothing
  virtual void ReleaseGlobalSignal(void* signal) const {}

 #if defined(__clang__)
@@ -440,6 +442,8 @@ class Device : public NullDevice {

  virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
  virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
+  virtual bool IsCacheFlushed(Device::CacheState state) const;
+  virtual void SetCacheState(Device::CacheState state);
  virtual void ReleaseGlobalSignal(void* signal) const;

  //! Allocate host memory in terms of numa policy set by user
@@ -583,6 +587,7 @@ class Device : public NullDevice {
  hsa_amd_memory_pool_t gpuvm_segment_;
  hsa_amd_memory_pool_t gpu_fine_grained_segment_;
  hsa_signal_t prefetch_signal_;    //!< Prefetch signal, used to explicitly prefetch SVM on device
+  std::atomic<int> cache_state_;    //!< Current Device::CacheState (kCacheStateInvalid/Ignore/Agent/System), stored as int

  size_t gpuvm_segment_max_alloc_;
  size_t alloc_granularity_;
@@ -77,6 +77,11 @@ static constexpr uint16_t kBarrierPacketHeader =
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

+static constexpr uint16_t kBarrierPacketAgentScopeHeader =  // Barrier-AND header, barrier bit set, agent-scope acquire/release
+    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
+    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+
 static constexpr uint16_t kNopPacketHeader =
    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
@@ -126,7 +131,7 @@ void Timestamp::checkGpuTime() {
      }
      // Avoid profiling data for the sync barrier, in tiny performance tests the first call
      // to ROCr is very slow and that also affects the overall performance of the callback thread
-      if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_) {
+      if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ > 0) {
        hsa_amd_profiling_dispatch_time_t time = {};
        if (it->engine_ == HwQueueEngine::Compute) {
          hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
@@ -442,7 +447,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
        // Update the current command/marker with HW event
        prof_signal->retain();
        ts->command().SetHwEvent(prof_signal);
-      } else if (ts->command().profilingInfo().marker_ts_ ) {
+      } else if (ts->command().profilingInfo().marker_ts_ > 0 ) {
        // Update the current command/marker with HW event
        prof_signal->retain();
        ts->command().SetHwEvent(prof_signal);
@@ -803,11 +808,15 @@ bool VirtualGPU::dispatchGenericAqlPacket(
  uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
  uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);

+  auto cache_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
+
  if (timestamp_ != nullptr) {
    // Pool size must grow to the size of pending AQL packets
    const uint32_t pool_size = index - read;
    // Get active signal for current dispatch if profiling is necessary
-    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
+    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
+                                                        pool_size);
  }

  // Make sure the slot is free for usage
@@ -845,8 +854,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
                           HSA_PACKET_HEADER_WIDTH_BARRIER),
            extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                           HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
-            extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                           HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+            cache_state,
            rest, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
@@ -863,6 +871,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(
  //hsa_queue_store_write_index_release(gpu_queue_, index);
  hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1);

+  roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
+
  // Wait on signal ?
  if (blocking) {
    LogInfo("Runtime reachead the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
@@ -957,6 +967,8 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
  uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
  uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);

+  auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
  if (!skipSignal) {
    // Pool size must grow to the size of pending AQL packets
    const uint32_t pool_size = index - read;
@@ -987,11 +999,13 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
                         HSA_PACKET_HEADER_WIDTH_BARRIER),
          extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                         HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
-          extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+          cache_state,
          barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1],
          barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3],
          barrier_packet_.dep_signal[4], barrier_packet_.completion_signal);
+
+  roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
+
  // Clear dependent signals for the next packet
  barrier_packet_.dep_signal[0] = hsa_signal_t{};
  barrier_packet_.dep_signal[1] = hsa_signal_t{};
@@ -2303,7 +2317,12 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
  unsigned int* headerPtr = reinterpret_cast<unsigned int*>(&header);
  __atomic_store_n(reinterpret_cast<uint32_t*>(aql_loc), *headerPtr, __ATOMIC_RELEASE);

+  auto cache_state = extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
+
  hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
+  roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
+
  ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
          "HWq=0x%zx, BarrierValue Header = 0x%x AmdFormat = 0x%x ",
          "(type=%d, barrier=%d, acquire=%d, release=%d), "
@@ -2313,8 +2332,7 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
          extractAqlBits(header.header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
          extractAqlBits(header.header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                         HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
-          extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+          cache_state,
          packet->completion_signal, packet->value, packet->mask, packet->cond,
          HSA_SIGNAL_CONDITION_GTE, HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_NE);
 }
@@ -3096,10 +3114,16 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
    } else {
      profilingBegin(vcmd);
      if (timestamp_ != nullptr) {
-        // Submit a barrier with a cache flushes.
-        dispatchBarrierPacket(kBarrierPacketHeader, false);
-
-        hasPendingDispatch_ = false;
+        uint32_t releaseFlags = vcmd.profilingInfo().marker_ts_;
+        if (ROC_EVENT_NO_FLUSH && releaseFlags == Device::CacheState::kCacheStateIgnore) {
+          dispatchBarrierPacket(kNopPacketHeader, false);
+        } else if (releaseFlags == Device::CacheState::kCacheStateAgent) {
+          dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false);
+        } else {
+          // Submit a barrier with a cache flushes.
+          dispatchBarrierPacket(kBarrierPacketHeader, false);
+          hasPendingDispatch_ = false;
+        }
      }
      profilingEnd(vcmd);
    }
@@ -55,8 +55,14 @@ Event::Event(HostQueue& queue)
 }

 // ================================================================================================
-Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
-    hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); }
+Event::Event()  // Default event: CL_SUBMITTED status, no callbacks, HW event, notify event or device
+    : callbacks_(NULL),
+      status_(CL_SUBMITTED),
+      hw_event_(nullptr),
+      notify_event_(nullptr),
+      device_(nullptr) {
+  notified_.clear();
+}

 // ================================================================================================
 Event::~Event() {
@@ -241,7 +247,7 @@ bool Event::awaitCompletion() {
      return false;
    }

-    ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d",
+    ClPrint(LOG_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d",
      this, status());
    auto* queue = command().queue();
    if ((queue != nullptr) && queue->vdev()->ActiveWait()) {
@@ -256,7 +262,7 @@ bool Event::awaitCompletion() {
        lock_.wait();
      }
    }
-    ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this);
+    ClPrint(LOG_DEBUG, LOG_WAIT, "Event %p wait completed", this);
  }

  return status() == CL_COMPLETE;
@@ -277,7 +283,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
        notified_.clear();
        return false;
      }
-      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
      command->enqueue();
      // Save notification, associated with the current event
      notify_event_ = command;
@@ -290,7 +296,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
        notified_.clear();
        return false;
      }
-      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
      command->enqueue();
      command->release();
    }
@@ -336,7 +342,7 @@ void Command::enqueue() {
    Agent::postEventCreate(as_cl(static_cast<Event*>(this)), type_);
  }

-  ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this);
+  ClPrint(LOG_DEBUG, LOG_CMD, "Command enqueued: %p", this);

  // Direct dispatch logic below will submit the command immediately, but the command status
  // update will occur later after flush() with a wait
@@ -360,7 +366,7 @@ void Command::enqueue() {
      EnableProfiling();
    }

-    if (isMarker && !profilingInfo().marker_ts_) {
+    if (isMarker && (profilingInfo().marker_ts_ == 0)) {
      // Update batch head for the current marker. Hence the status of all commands can be
      // updated upon the marker completion
      SetBatchHead(queue_->GetSubmittionBatch());
@@ -414,6 +420,7 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList
    profilingInfo_.enabled_ = true;
    profilingInfo_.clear();
    profilingInfo_.callback_ = nullptr;
+    profilingInfo_.marker_ts_ = 1;
  }
  kernel_.retain();
 }
@@ -102,7 +102,7 @@ class Event : public RuntimeObject {
  static const EventWaitList nullWaitList;

  struct ProfilingInfo {
-    ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) {
+    ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(0) {
      if (enabled) {
        clear();
        callback_ = nullptr;
@@ -113,10 +113,11 @@ class Event : public RuntimeObject {
    uint64_t submitted_;
    uint64_t start_;
    uint64_t end_;
-    bool enabled_;    //!< Profiling enabled for the wave limiter
-    uint32_t waves_;  //!< The number of waves used in a dispatch
+    bool enabled_;        //!< Profiling enabled for the wave limiter
+    uint32_t waves_;      //!< The number of waves used in a dispatch
    ProfilingCallback* callback_;
-    bool marker_ts_;
+    uint32_t marker_ts_;  //!< Marker with release scope
+                          //!< 5 - system scope, 3 - device scope, 1 - no scopes; NOTE(review): submitMarker compares this with Device::CacheState values (0, 1) - confirm encoding
    void clear() {
      queued_ = 0ULL;
      submitted_ = 0ULL;
@@ -106,20 +106,20 @@ bool HostQueue::terminate() {

 void HostQueue::finish() {
  Command* command = nullptr;
+  bool isCacheFlushed = device().IsCacheFlushed(Device::CacheState::kCacheStateSystem);
  if (IS_HIP) {
    command = getLastQueuedCommand(true);
-    // Check if the queue has nothing to process and return
-    if (AMD_DIRECT_DISPATCH &&  command == nullptr) {
+    if (AMD_DIRECT_DISPATCH && isCacheFlushed && command == nullptr) {
      return;
    }
  }
-  if (nullptr == command) {
+  if (nullptr == command || !isCacheFlushed) {
    // Send a finish to make sure we finished all commands
    command = new Marker(*this, false);
    if (command == NULL) {
      return;
    }
-    ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
+    ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued, Cache Flushed = %d", isCacheFlushed);
    command->enqueue();
  }
  // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
@@ -194,7 +194,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
      continue;
    }

-    ClPrint(LOG_DEBUG, LOG_CMD, "command (%s) is submitted: %p", getOclCommandKindString(command->type()), command);
+    ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) submitted: %p", getOclCommandKindString(command->type()), command);

    command->setStatus(CL_SUBMITTED);

@@ -273,7 +273,9 @@ release(uint, ROC_AQL_QUEUE_SIZE, 4096,                                       \
 release(bool, ROC_SKIP_KERNEL_ARG_COPY, false,                                \
        "If true, then runtime can skip kernel arg copy")                     \
 release(bool, GPU_STREAMOPS_CP_WAIT, false,                                   \
-        "Force the stream wait memory operation to wait on CP.")
+        "Force the stream wait memory operation to wait on CP.")              \
+release(bool, ROC_EVENT_NO_FLUSH, false,                                      \
+        "Use NOP AQL packet for event records with no explicit flags.")

 namespace amd {