From 40f41f4d0bb778bce6edce634164c2614d979465 Mon Sep 17 00:00:00 2001
From: Saleel Kudchadker <Saleel.Kudchadker@amd.com>
Date: Thu, 26 Oct 2023 20:06:18 +0000
Subject: [PATCH] SWDEV-422207 - Track commands for capture

- Track all captured commands under a new AccumulateCommand
- Add begin() and end() methods to capture commands
- Pass an explicit timestamp (TS) object to certain methods, because
  profilingBegin() and profilingEnd() now happen separately and can
  therefore run into threading issues

Change-Id: I171106bdcad72b057836cb2f3fc398db3533119f
---
 hipamd/src/hip_graph_internal.cpp | 16 +++---
 rocclr/device/device.hpp          |  6 ++-
 rocclr/device/pal/palvirtual.cpp  |  8 ++-
 rocclr/device/pal/palvirtual.hpp  |  4 +-
 rocclr/device/rocm/rocvirtual.cpp | 89 ++++++++++++++++++++-----------
 rocclr/device/rocm/rocvirtual.hpp | 10 ++--
 rocclr/platform/command.cpp       |  2 +-
 rocclr/platform/command.hpp       | 31 ++++++++---
 8 files changed, 110 insertions(+), 56 deletions(-)

diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp
index 884b4cdf6b..631e54ec80 100644
--- a/hipamd/src/hip_graph_internal.cpp
+++ b/hipamd/src/hip_graph_internal.cpp
@@ -529,9 +529,14 @@ hipError_t GraphExec::Run(hipStream_t stream) {
   }
 
   if (parallelLists_.size() == 1) {
+    amd::AccumulateCommand* accumulate = nullptr;
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+      accumulate = new amd::AccumulateCommand(*hip_stream);
+    }
+
     for (int i = 0; i < topoOrder_.size(); i++) {
       if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
-        hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket());
+        hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
       } else {
         topoOrder_[i]->SetStream(hip_stream, this);
         status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
@@ -540,13 +545,8 @@ hipError_t GraphExec::Run(hipStream_t stream) {
     }
 
     if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-      amd::Command* endCommand = nullptr;
-      endCommand = new amd::Marker(*hip_stream, false);
-      // Since the end command is for graph completion tracking,
-      // it may not need release scopes
-      endCommand->setEventScope(amd::Device::kCacheStateIgnore);
-      endCommand->enqueue();
-      endCommand->release();
+      accumulate->enqueue();
+      accumulate->release();
     }
   } else {
     UpdateStream(parallelLists_, hip_stream, this);
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index c1459d24cf..c3662a9982 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -79,6 +79,7 @@ class PerfCounterCommand;
 class ReleaseObjectCommand;
 class StallQueueCommand;
 class Marker;
+class AccumulateCommand;
 class ThreadTraceCommand;
 class ThreadTraceMemObjectsCommand;
 class SignalCommand;
@@ -1248,6 +1249,7 @@ class VirtualDevice : public amd::HeapObject {
   virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0;
   virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0;
   virtual void submitMarker(amd::Marker& cmd) = 0;
+  virtual void submitAccumulate(amd::AccumulateCommand& cmd) = 0;
   virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) = 0;
   virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0;
   virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0;
@@ -1287,7 +1289,9 @@ class VirtualDevice : public amd::HeapObject {
 
   //! Returns fence state of the VirtualGPU
   virtual bool isFenceDirty() const = 0;
-  virtual bool dispatchAqlPacket(uint8_t* aqlpacket) = 0;
+
+  //! Dispatch captured AQL packet
+  virtual bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) = 0;
 
   //! Resets fence state of the VirtualGPU
   virtual void resetFenceDirty() = 0;
diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp
index 7a095f0117..96a199d9eb 100644
--- a/rocclr/device/pal/palvirtual.cpp
+++ b/rocclr/device/pal/palvirtual.cpp
@@ -2709,6 +2709,7 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
   Unimplemented();  //!< @todo: Unimplemented
 }
 
+// ================================================================================================
 void VirtualGPU::submitMarker(amd::Marker& vcmd) {
   //!@note runtime doesn't need to lock this command on execution
 
@@ -2735,6 +2736,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
   }
 }
 
+// ================================================================================================
+void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
+}
+
+// ================================================================================================
 void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
 
   const Pal::IQueueSemaphore* sem = reinterpret_cast<const Pal::IQueueSemaphore*>(cmd.sem_ptr());
@@ -2748,10 +2754,8 @@ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
     queues_[MainEngine]->iQueue_->WaitQueueSemaphore(const_cast<Pal::IQueueSemaphore*>(sem),
                                                        cmd.fence());
   }
-
 }
 
-
 void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
   queues_[MainEngine]->removeCmdMemRef(mem);
   if (!dev().settings().disableSdma_) {
diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp
index dbed5809db..808e6c2616 100644
--- a/rocclr/device/pal/palvirtual.hpp
+++ b/rocclr/device/pal/palvirtual.hpp
@@ -318,6 +318,7 @@ class VirtualGPU : public device::VirtualDevice {
   void submitFillMemory(amd::FillMemoryCommand& vcmd);
   void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
   void submitMarker(amd::Marker& vcmd);
+  void submitAccumulate(amd::AccumulateCommand& vcmd);
   void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
   void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
   void submitPerfCounter(amd::PerfCounterCommand& vcmd);
@@ -342,7 +343,8 @@ class VirtualGPU : public device::VirtualDevice {
 
   bool isFenceDirty() const { return false; }
 
-  inline bool dispatchAqlPacket(uint8_t* aqlpacket) { return false; }
+  inline bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) {
+     return false; }
 
   void resetFenceDirty() {}
 
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index e8a75e9df4..16706003f8 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013 - 2022 Advanced Micro Devices, Inc.
+/* Copyright (c) 2013 - 2023 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -127,7 +127,6 @@ void Timestamp::checkGpuTime() {
 
     for (auto it : signals_) {
       amd::ScopedLock lock(it->LockSignalOps());
-
       // Ignore the wait if runtime processes API callback, because the signal value is bigger
       // than expected and the value reset will occur after API callback is done
       if (GetCallbackSignal().handle == 0) {
@@ -149,7 +148,8 @@ void Timestamp::checkGpuTime() {
         start = std::min(time.start, start);
         end = std::max(time.end, end);
         ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
-          "end = %ld time taken= %ld ns", it->signal_.handle, start, end, end - start);
+          "end = %ld time taken= %ld ns", it->signal_.handle, time.start, time.end,
+          time.end - time.start);
       }
       it->flags_.done_ = true;
     }
@@ -848,6 +848,13 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   if (timestamp_ != nullptr) {
     // Get active signal for current dispatch if profiling is necessary
     packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
+
+    // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
+    // retrieve this correlation ID to attribute waves to specific dispatch locations.
+    if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value) {
+      auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
+      dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
+    }
   }
 
   // Make sure the slot is free for usage
@@ -954,6 +961,24 @@ bool VirtualGPU::dispatchAqlPacket(
   return dispatchGenericAqlPacket(packet, header, rest, blocking);
 }
 
+// ================================================================================================
+inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) {
+  amd::ScopedLock lock(execution());
+  // vcmd defaults to nullptr, so every use of it below must stay behind a null check
+  if (vcmd != nullptr) {
+    profilingBegin(*vcmd, true, true);
+  }
+  dispatchBlockingWait();
+  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
+
+  constexpr size_t kPacketSize = 1;
+  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
+  if (vcmd != nullptr) {
+    profilingEnd(*vcmd, true);
+  }
+  return true;
+}
+
 // ================================================================================================
 bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
                                           const uint32_t gfxVersion, bool blocking,
@@ -1056,21 +1081,6 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
   barrier_packet_.dep_signal[4] = hsa_signal_t{};
 }
 
-inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket) {
-  dispatchBlockingWait();
-  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
-  // If rocprof tracing is enabled, store the correlation ID in the dispatch packet.
-  // The profiler can retrieve this correlation ID to attribute waves to specific dispatch
-  // locations.
-  if (activity_prof::IsEnabled(OP_ID_DISPATCH) || profiling_) {
-    packet->reserved2 = activity_prof::correlation_id;
-    // Get active signal for current dispatch if profiling is necessary
-    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
-  }
-  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false);
-  return true;
-}
-
 // ================================================================================================
 void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveDepSignal,
                                             hsa_signal_t signal, hsa_signal_value_t value,
@@ -1430,17 +1440,23 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
 * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
 * and then calls start() to get the current host timestamp.
 */
-void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
+void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) {
   if (command.profilingInfo().enabled_) {
     if (timestamp_ != nullptr) {
       LogWarning("Trying to create a second timestamp in VirtualGPU. \
                   This could have unintended consequences.");
       return;
     }
-    // Without barrier profiling will wait for each individual signal
-    timestamp_ = new Timestamp(this, command);
-    command.setData(timestamp_);
-    timestamp_->start();
+    Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
+
+    if (ts == nullptr) {
+      // Without barrier profiling will wait for each individual signal
+      timestamp_ = new Timestamp(this, command);
+      command.setData(timestamp_);
+      timestamp_->start();
+    } else {
+      timestamp_ = ts;
+    }
 
     // Enable SDMA profiling on the first access if profiling is set
     // Its not per command basis
@@ -1473,10 +1489,11 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available.
 */
-void VirtualGPU::profilingEnd(amd::Command& command) {
+void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) {
   if (command.profilingInfo().enabled_) {
-    if (timestamp_->HwProfiling() == false) {
-      timestamp_->end();
+    Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
+    if (ts->HwProfiling() == false) {
+      ts->end();
     }
     timestamp_ = nullptr;
   }
@@ -3238,11 +3255,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       addSystemScope_ = true;
     }
 
-    // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
-    // retrieve this correlation ID to attribute waves to specific dispatch locations.
-    if (vcmd != nullptr && vcmd->profilingInfo().enabled_) {
-      dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_;
-    }
 
     // Copy scheduler's AQL packet for possible relaunch from the scheduler itself
     if (aql_packet != nullptr) {
@@ -3374,7 +3386,6 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
 
 // ================================================================================================
 void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
-  // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
 }
 
 // ================================================================================================
@@ -3413,6 +3424,20 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
   }
 }
 
+// ================================================================================================
+void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
+  // Make sure VirtualGPU has an exclusive access to the resources
+  amd::ScopedLock lock(execution());
+  profilingBegin(vcmd, true, true);
+  const Settings& settings = dev().settings();
+  if (settings.barrier_value_packet_) {
+    dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
+  } else {
+    dispatchBarrierPacket(kNopPacketHeader, false);
+  }
+  profilingEnd(vcmd, true);
+}
+
 // ================================================================================================
 void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 98071de0a0..bfae393726 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
+/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -299,8 +299,8 @@ class VirtualGPU : public device::VirtualDevice {
   bool create();
   const Device& dev() const { return roc_device_; }
 
-  void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
-  void profilingEnd(amd::Command& command);
+  void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false);
+  void profilingEnd(amd::Command& command, bool useCommandTs = false);
 
   void updateCommandsState(amd::Command* list) const;
 
@@ -321,7 +321,7 @@ class VirtualGPU : public device::VirtualDevice {
                             );
   void submitNativeFn(amd::NativeFnCommand& cmd);
   void submitMarker(amd::Marker& cmd);
-
+  void submitAccumulate(amd::AccumulateCommand& cmd);
   void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd);
   void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd);
   void submitPerfCounter(amd::PerfCounterCommand& cmd);
@@ -416,7 +416,7 @@ class VirtualGPU : public device::VirtualDevice {
   //! Dispatches a barrier with blocking HSA signals
   void dispatchBlockingWait();
 
-  inline bool dispatchAqlPacket(uint8_t* aqlpacket);
+  inline bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr);
   bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
                          bool blocking = true, bool capturing = false,
                          const uint8_t* aqlPacket = nullptr);
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index c455acd58c..0810d998e0 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -362,7 +362,7 @@ void Command::enqueue() {
     ScopedLock sl(queue_->vdev()->execution());
     queue_->FormSubmissionBatch(this);
 
-    if (type() == CL_COMMAND_MARKER || type() == 0) {
+    if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) {
       // The current HSA signal tracking logic requires profiling enabled for the markers
       EnableProfiling();
       // Update batch head for the current marker. Hence the status of all commands can be
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index 6861546951..ece4a21a0b 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -269,7 +269,8 @@ class Command : public Event {
   uint32_t commandWaitBits_;
 
   //! Construct a new command of the given OpenCL type.
-  Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList = nullWaitList,
+  Command(HostQueue& queue, cl_command_type type,
+          const EventWaitList& eventWaitList = nullWaitList,
           uint32_t commandWaitBits = 0, const Event* waitingEvent = nullptr);
 
   //! Construct a new command of the given OpenCL type.
@@ -857,7 +858,7 @@ class CopyMemoryCommand : public TwoMemoryArgsCommand {
       : TwoMemoryArgsCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory),
         srcOrigin_(srcOrigin),
         dstOrigin_(dstOrigin),
-        size_(size), 
+        size_(size),
         copyMetadata_(copyMetadata){
     // Sanity checks
     assert(size.c[0] > 0 && "invalid");
@@ -1222,7 +1223,8 @@ class ExternalSemaphoreCmd : public Command {
  public:
   ExternalSemaphoreCmd(HostQueue& queue, const void* sem_ptr, uint64_t fence,
                        ExternalSemaphoreCmdType cmd_type)
-      : Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence), cmd_type_(cmd_type) {}
+      : Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence),
+                         cmd_type_(cmd_type) {}
 
   virtual void submit(device::VirtualDevice& device) {
     device.submitExternalSemaphoreCmd(*this);
@@ -1239,12 +1241,28 @@ class Marker : public Command {
   //! Create a new Marker
   Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList,
          const Event* waitingEvent = nullptr, bool cpu_wait = false)
-      : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) { cpu_wait_ = cpu_wait; }
+      : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent)
+    {
+      cpu_wait_ = cpu_wait;
+    }
 
   //! The actual command implementation.
   virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); }
 };
 
+class AccumulateCommand : public Command {
+ public:
+  //! Create a new AccumulateCommand, under which captured commands are tracked
+  AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
+         const Event* waitingEvent = nullptr)
+      : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent) {}
+
+  //! The command implementation
+  virtual void submit(device::VirtualDevice& device) {
+    device.submitAccumulate(*this);
+  }
+};
+
 /*! \brief  Maps CL objects created from external ones and syncs the contents (blocking).
  *
  */
@@ -1516,8 +1534,9 @@ class SvmFreeMemoryCommand : public Command {
   void* userData_;                  //!< Data passed to user-defined callback
 
  public:
-  SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, uint32_t numSvmPointers,
-                       void** svmPointers, freeCallBack pfnFreeFunc, void* userData)
+  SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList,
+                       uint32_t numSvmPointers, void** svmPointers,
+                       freeCallBack pfnFreeFunc, void* userData)
       : Command(queue, CL_COMMAND_SVM_FREE, eventWaitList),
         //! We copy svmPointers since it can be reused/deallocated after
         //  command creation