SWDEV-422207 - Track commands for capture

- Track all captured commands under a new AccumulateCommand
- Add begin() and end() methods to capture commands
- An explicit Timestamp (TS) object is now passed to certain methods because profilingBegin() and profilingEnd() now happen separately and could otherwise run into threading issues

Change-Id: I171106bdcad72b057836cb2f3fc398db3533119f
2023-10-26 20:06:18 +00:00
@@ -529,9 +529,14 @@ hipError_t GraphExec::Run(hipStream_t stream) {
  }

  if (parallelLists_.size() == 1) {
+    amd::AccumulateCommand* accumulate = nullptr;
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+      accumulate = new amd::AccumulateCommand(*hip_stream);
+    }
+
    for (int i = 0; i < topoOrder_.size(); i++) {
      if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
-        hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket());
+        hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
      } else {
        topoOrder_[i]->SetStream(hip_stream, this);
        status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
@@ -540,13 +545,8 @@ hipError_t GraphExec::Run(hipStream_t stream) {
    }

    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-      amd::Command* endCommand = nullptr;
-      endCommand = new amd::Marker(*hip_stream, false);
-      // Since the end command is for graph completion tracking,
-      // it may not need release scopes
-      endCommand->setEventScope(amd::Device::kCacheStateIgnore);
-      endCommand->enqueue();
-      endCommand->release();
+      accumulate->enqueue();
+      accumulate->release();
    }
  } else {
    UpdateStream(parallelLists_, hip_stream, this);
@@ -79,6 +79,7 @@ class PerfCounterCommand;
 class ReleaseObjectCommand;
 class StallQueueCommand;
 class Marker;
+class AccumulateCommand;
 class ThreadTraceCommand;
 class ThreadTraceMemObjectsCommand;
 class SignalCommand;
@@ -1248,6 +1249,7 @@ class VirtualDevice : public amd::HeapObject {
  virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0;
  virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0;
  virtual void submitMarker(amd::Marker& cmd) = 0;
+  virtual void submitAccumulate(amd::AccumulateCommand& cmd) = 0;
  virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) = 0;
  virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0;
  virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0;
@@ -1287,7 +1289,9 @@ class VirtualDevice : public amd::HeapObject {

  //! Returns fence state of the VirtualGPU
  virtual bool isFenceDirty() const = 0;
-  virtual bool dispatchAqlPacket(uint8_t* aqlpacket) = 0;
+
+  //! Dispatch captured AQL packet
+  virtual bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) = 0;

  //! Resets fence state of the VirtualGPU
  virtual void resetFenceDirty() = 0;
@@ -2709,6 +2709,7 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
  Unimplemented();  //!< @todo: Unimplemented
 }

+// ================================================================================================
 void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  //!@note runtime doesn't need to lock this command on execution

@@ -2735,6 +2736,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  }
 }

+// ================================================================================================
+void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
+}  // Empty body: no work is performed for AccumulateCommand in this implementation
+
+// ================================================================================================
 void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {

  const Pal::IQueueSemaphore* sem = reinterpret_cast<const Pal::IQueueSemaphore*>(cmd.sem_ptr());
@@ -2748,10 +2754,8 @@ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
    queues_[MainEngine]->iQueue_->WaitQueueSemaphore(const_cast<Pal::IQueueSemaphore*>(sem),
                                                       cmd.fence());
  }
-
 }

-
 void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
  queues_[MainEngine]->removeCmdMemRef(mem);
  if (!dev().settings().disableSdma_) {
@@ -318,6 +318,7 @@ class VirtualGPU : public device::VirtualDevice {
  void submitFillMemory(amd::FillMemoryCommand& vcmd);
  void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
  void submitMarker(amd::Marker& vcmd);
+  void submitAccumulate(amd::AccumulateCommand& vcmd);
  void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
  void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
  void submitPerfCounter(amd::PerfCounterCommand& vcmd);
@@ -342,7 +343,8 @@ class VirtualGPU : public device::VirtualDevice {

  bool isFenceDirty() const { return false; }

-  inline bool dispatchAqlPacket(uint8_t* aqlpacket) { return false; }
+  inline bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) {
+     return false; }

  void resetFenceDirty() {}

@@ -1,4 +1,4 @@
-/* Copyright (c) 2013 - 2022 Advanced Micro Devices, Inc.
+/* Copyright (c) 2013 - 2023 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -127,7 +127,6 @@ void Timestamp::checkGpuTime() {

    for (auto it : signals_) {
      amd::ScopedLock lock(it->LockSignalOps());
-
      // Ignore the wait if runtime processes API callback, because the signal value is bigger
      // than expected and the value reset will occur after API callback is done
      if (GetCallbackSignal().handle == 0) {
@@ -149,7 +148,8 @@ void Timestamp::checkGpuTime() {
        start = std::min(time.start, start);
        end = std::max(time.end, end);
        ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
-          "end = %ld time taken= %ld ns", it->signal_.handle, start, end, end - start);
+          "end = %ld time taken= %ld ns", it->signal_.handle, time.start, time.end,
+          time.end - time.start);
      }
      it->flags_.done_ = true;
    }
@@ -848,6 +848,13 @@ bool VirtualGPU::dispatchGenericAqlPacket(
  if (timestamp_ != nullptr) {
    // Get active signal for current dispatch if profiling is necessary
    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
+
+    // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
+    // retrieve this correlation ID to attribute waves to specific dispatch locations.
+    if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value) {
+      auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
+      dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
+    }
  }

  // Make sure the slot is free for usage
@@ -954,6 +961,24 @@ bool VirtualGPU::dispatchAqlPacket(
  return dispatchGenericAqlPacket(packet, header, rest, blocking);
 }

+// ================================================================================================
+// Dispatches a captured kernel AQL packet; vcmd may be nullptr, which skips the profiling calls.
+inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) {
+  // Hold the execution lock for the whole dispatch
+  amd::ScopedLock lock(execution());
+  if (vcmd != nullptr) {
+    profilingBegin(*vcmd, true, true);
+  }
+  dispatchBlockingWait();
+  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
+  constexpr size_t kPacketSize = 1;
+  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
+  if (vcmd != nullptr) {
+    profilingEnd(*vcmd, true);
+  }
+  return true;
+}
+
 // ================================================================================================
 bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
                                          const uint32_t gfxVersion, bool blocking,
@@ -1056,21 +1081,6 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
  barrier_packet_.dep_signal[4] = hsa_signal_t{};
 }

-inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket) {
-  dispatchBlockingWait();
-  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
-  // If rocprof tracing is enabled, store the correlation ID in the dispatch packet.
-  // The profiler can retrieve this correlation ID to attribute waves to specific dispatch
-  // locations.
-  if (activity_prof::IsEnabled(OP_ID_DISPATCH) || profiling_) {
-    packet->reserved2 = activity_prof::correlation_id;
-    // Get active signal for current dispatch if profiling is necessary
-    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
-  }
-  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false);
-  return true;
-}
-
 // ================================================================================================
 void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveDepSignal,
                                            hsa_signal_t signal, hsa_signal_value_t value,
@@ -1430,17 +1440,23 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
 * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
 * and then calls start() to get the current host timestamp.
 */
-void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
+void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) {
  if (command.profilingInfo().enabled_) {
    if (timestamp_ != nullptr) {
      LogWarning("Trying to create a second timestamp in VirtualGPU. \
                  This could have unintended consequences.");
      return;
    }
-    // Without barrier profiling will wait for each individual signal
-    timestamp_ = new Timestamp(this, command);
-    command.setData(timestamp_);
-    timestamp_->start();
+    Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
+
+    if (ts == nullptr) {
+      // Without barrier profiling will wait for each individual signal
+      timestamp_ = new Timestamp(this, command);
+      command.setData(timestamp_);
+      timestamp_->start();
+    } else {
+      timestamp_ = ts;
+    }

    // Enable SDMA profiling on the first access if profiling is set
    // Its not per command basis
@@ -1473,10 +1489,11 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available.
 */
-void VirtualGPU::profilingEnd(amd::Command& command) {
+void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) {
  if (command.profilingInfo().enabled_) {
-    if (timestamp_->HwProfiling() == false) {
-      timestamp_->end();
+    Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
+    if (ts->HwProfiling() == false) {
+      ts->end();
    }
    timestamp_ = nullptr;
  }
@@ -3238,11 +3255,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
      addSystemScope_ = true;
    }

-    // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
-    // retrieve this correlation ID to attribute waves to specific dispatch locations.
-    if (vcmd != nullptr && vcmd->profilingInfo().enabled_) {
-      dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_;
-    }

    // Copy scheduler's AQL packet for possible relaunch from the scheduler itself
    if (aql_packet != nullptr) {
@@ -3374,7 +3386,6 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {

 // ================================================================================================
 void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
-  // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
 }

 // ================================================================================================
@@ -3413,6 +3424,20 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  }
 }

+// ================================================================================================
+void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
+  // Take the execution lock so this VirtualGPU is used exclusively during submission
+  amd::ScopedLock lock(execution());
+  // sdmaProfiling = true, useCommandTs = true
+  profilingBegin(vcmd, true, true);
+  if (dev().settings().barrier_value_packet_) {
+    dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
+  } else {
+    dispatchBarrierPacket(kNopPacketHeader, false);
+  }
+  profilingEnd(vcmd, true);
+}
+
 // ================================================================================================
 void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
+/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -299,8 +299,8 @@ class VirtualGPU : public device::VirtualDevice {
  bool create();
  const Device& dev() const { return roc_device_; }

-  void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
-  void profilingEnd(amd::Command& command);
+  void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false);
+  void profilingEnd(amd::Command& command, bool useCommandTs = false);

  void updateCommandsState(amd::Command* list) const;

@@ -321,7 +321,7 @@ class VirtualGPU : public device::VirtualDevice {
                            );
  void submitNativeFn(amd::NativeFnCommand& cmd);
  void submitMarker(amd::Marker& cmd);
-
+  void submitAccumulate(amd::AccumulateCommand& cmd);
  void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd);
  void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd);
  void submitPerfCounter(amd::PerfCounterCommand& cmd);
@@ -416,7 +416,7 @@ class VirtualGPU : public device::VirtualDevice {
  //! Dispatches a barrier with blocking HSA signals
  void dispatchBlockingWait();

-  inline bool dispatchAqlPacket(uint8_t* aqlpacket);
+  inline bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr);
  bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
                         bool blocking = true, bool capturing = false,
                         const uint8_t* aqlPacket = nullptr);
@@ -362,7 +362,7 @@ void Command::enqueue() {
    ScopedLock sl(queue_->vdev()->execution());
    queue_->FormSubmissionBatch(this);

-    if (type() == CL_COMMAND_MARKER || type() == 0) {
+    if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) {
      // The current HSA signal tracking logic requires profiling enabled for the markers
      EnableProfiling();
      // Update batch head for the current marker. Hence the status of all commands can be
@@ -269,7 +269,8 @@ class Command : public Event {
  uint32_t commandWaitBits_;

  //! Construct a new command of the given OpenCL type.
-  Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList = nullWaitList,
+  Command(HostQueue& queue, cl_command_type type,
+          const EventWaitList& eventWaitList = nullWaitList,
          uint32_t commandWaitBits = 0, const Event* waitingEvent = nullptr);

  //! Construct a new command of the given OpenCL type.
@@ -857,7 +858,7 @@ class CopyMemoryCommand : public TwoMemoryArgsCommand {
      : TwoMemoryArgsCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory),
        srcOrigin_(srcOrigin),
        dstOrigin_(dstOrigin),
-        size_(size), 
+        size_(size),
        copyMetadata_(copyMetadata){
    // Sanity checks
    assert(size.c[0] > 0 && "invalid");
@@ -1222,7 +1223,8 @@ class ExternalSemaphoreCmd : public Command {
 public:
  ExternalSemaphoreCmd(HostQueue& queue, const void* sem_ptr, uint64_t fence,
                       ExternalSemaphoreCmdType cmd_type)
-      : Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence), cmd_type_(cmd_type) {}
+      : Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence),
+                         cmd_type_(cmd_type) {}

  virtual void submit(device::VirtualDevice& device) {
    device.submitExternalSemaphoreCmd(*this);
@@ -1239,12 +1241,28 @@ class Marker : public Command {
  //! Create a new Marker
  Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList,
         const Event* waitingEvent = nullptr, bool cpu_wait = false)
-      : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) { cpu_wait_ = cpu_wait; }
+      : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent)
+    {
+      cpu_wait_ = cpu_wait;
+    }

  //! The actual command implementation.
  virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); }
 };

+//! \brief Command under which captured commands are tracked.
+//! It is constructed with the CL_COMMAND_TASK type and submitted via submitAccumulate().
+class AccumulateCommand : public Command {
+ public:
+  //! Create a new AccumulateCommand
+  AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
+                    const Event* waitingEvent = nullptr)
+      : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent) {}
+
+  //! The actual command implementation.
+  virtual void submit(device::VirtualDevice& device) { device.submitAccumulate(*this); }
+};
+
 /*! \brief  Maps CL objects created from external ones and syncs the contents (blocking).
 *
 */
@@ -1516,8 +1534,9 @@ class SvmFreeMemoryCommand : public Command {
  void* userData_;                  //!< Data passed to user-defined callback

 public:
-  SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, uint32_t numSvmPointers,
-                       void** svmPointers, freeCallBack pfnFreeFunc, void* userData)
+  SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList,
+                       uint32_t numSvmPointers, void** svmPointers,
+                       freeCallBack pfnFreeFunc, void* userData)
      : Command(queue, CL_COMMAND_SVM_FREE, eventWaitList),
        //! We copy svmPointers since it can be reused/deallocated after
        //  command creation