SWDEV-422207 - Optimize graph end detection

- Do not use an extra barrier to detect graph end. If it's a kernel node, we can use a completion signal for the last packet. Saves roughly 6us per graph launch for the Phantom testcase. Change-Id: I5e0c2479d9964fbeda86ed97533f6718f49a7f91
2023-11-09 23:52:40 +00:00
@@ -541,11 +541,14 @@ hipError_t GraphExec::Run(hipStream_t stream) {

  if (parallelLists_.size() == 1) {
    amd::AccumulateCommand* accumulate = nullptr;
+    bool isLastPacketKernel = false;
    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-      accumulate = new amd::AccumulateCommand(*hip_stream);
+      uint8_t* lastCapturedPacket = (topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) ?
+                                  topoOrder_.back()->GetAqlPacket() : nullptr;
+      accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr, lastCapturedPacket);
    }

-    for (int i = 0; i < topoOrder_.size(); i++) {
+    for (int i = 0; i < topoOrder_.size() - 1; i++) {
      if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
        hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
      } else {
@@ -555,7 +558,20 @@ hipError_t GraphExec::Run(hipStream_t stream) {
      }
    }

-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+    // If last captured packet is kernel, optimize to detect completion of last kernel
+    // This saves on extra packet submitted to determine end of graph
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) {
+      accumulate->enqueue();
+      accumulate->release();
+      isLastPacketKernel = true;
+    } else {
+      topoOrder_.back()->SetStream(hip_stream, this);
+      status = topoOrder_.back()->CreateCommand(topoOrder_.back()->GetQueue());
+      topoOrder_.back()->EnqueueCommands(stream);
+    }
+
+    // If last packet is not kernel, submit a marker to detect end of graph
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && !isLastPacketKernel) {
      accumulate->enqueue();
      accumulate->release();
    }
@@ -974,10 +974,8 @@ inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCom
    profilingBegin(*vcmd, true, true);
  }
  dispatchBlockingWait();
-  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
-
  constexpr size_t kPacketSize = 1;
-  Timestamp* ts = reinterpret_cast<Timestamp*>(vcmd->data());
+  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
  if (vcmd != nullptr) {
    profilingEnd(*vcmd, true);
@@ -3435,12 +3433,22 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd, true, true);
-  const Settings& settings = dev().settings();
-  if (settings.barrier_value_packet_) {
-    dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
+
+  uint8_t* aqlPacket = vcmd.getLastPacket();
+  if (aqlPacket != nullptr) {
+    dispatchBlockingWait();
+    constexpr size_t kPacketSize = 1;
+    auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
+    dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
  } else {
-    dispatchBarrierPacket(kNopPacketHeader, false);
+    const Settings& settings = dev().settings();
+    if (settings.barrier_value_packet_) {
+      dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
+    } else {
+      dispatchBarrierPacket(kNopPacketHeader, false);
+    }
  }
+
  profilingEnd(vcmd, true);
 }

@@ -1261,14 +1261,19 @@ class Marker : public Command {
 };

 class AccumulateCommand : public Command {
+ private:
+  uint8_t* lastPacket_;
 public:
  //! Create a new Marker
  AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
-         const Event* waitingEvent = nullptr)
-      : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent) {
+         const Event* waitingEvent = nullptr, uint8_t* lastPacket = nullptr)
+      : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent),
+        lastPacket_(lastPacket)
+      {
        profilingInfo_.multiple_ts_ = true;
      }
-
+  // Return last packet
+  uint8_t* getLastPacket() const { return lastPacket_; }
  //! The command implementation
  virtual void submit(device::VirtualDevice& device) {
    device.submitAccumulate(*this);