SWDEV-301667 - Create TS for each node recorded in graph

- Create a vector to allow multiple TS to be stored in Command. - This would mean we don't wait for entire batch in Accumulate command to finish when we exhaust signals. - Reduce the number of signals created at init to 64. This min value may still need to be tuned but the KFD allows max of 4094 interrupt signals per device. - Store kernel names whenever they are available and not just when profiling. If we dynamically enable profiling like for Torch, a crash can happen if hipGraphInstantiate wasn't included in Torch profile scope because we previously entered kernel names only when profiler is attached. Change-Id: I34e7881a25bbc763f82fdeb3408a8ea58e1ec006 [ROCm/clr commit: c157bfb202]
2024-03-21 18:26:01 +00:00
@@ -769,7 +769,7 @@ RUNTIME_ENTRY_RET(cl_event, clCreateEventFromGLsyncKHR,
  // initially set the status of fence as queued
  clglEvent->setStatus(CL_SUBMITTED);
  // store GLsync id of the fence in event in order to associate them together
-  clglEvent->setData(clGLsync);
+  clglEvent->data().emplace_back(clGLsync);
  amd::Event* evt = clglEvent;
  evt->retain();
  *not_null(errcode_ret) = CL_SUCCESS;
@@ -3317,7 +3317,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
      return;
    }
    // Save the TimeStamp object in the current OCL event
-    command.setData(ts);
+    command.data().emplace_back(ts);
    profileTs_ = ts;
    state_.profileEnabled_ = true;
  }
@@ -3325,7 +3325,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {

 void VirtualGPU::profilingEnd(amd::Command& command) {
  // Get the TimeStamp object associated witht the current command
-  TimeStamp* ts = reinterpret_cast<TimeStamp*>(command.data());
+  TimeStamp* ts = !command.data().empty() ? reinterpret_cast<TimeStamp*>(command.data().back())
+                                            : nullptr;
  if (ts != nullptr) {
    // Check if the command actually did any GPU submission
    if (ts->isValid()) {
@@ -3333,7 +3334,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
    } else {
      // Destroy the TimeStamp object
      tsCache_->freeTimeStamp(ts);
-      command.setData(nullptr);
+      command.data().clear();
    }
  }
 }
@@ -3362,7 +3363,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
  first = cb->head_;
  while (nullptr != first) {
    // Get the TimeStamp object associated witht the current command
-    TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
+    TimeStamp* ts = !first->data().empty() ? reinterpret_cast<TimeStamp*>(first->data().back())
+                                            : nullptr;

    if (ts != nullptr) {
      ts->value(&startTimeStamp, &endTimeStamp);
@@ -3379,7 +3381,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
  first = cb->head_;
  while (nullptr != first) {
    // Get the TimeStamp object associated witht the current command
-    TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
+    TimeStamp* ts = !first->data().empty() ? reinterpret_cast<TimeStamp*>(first->data().back())
+                                            : nullptr;

    current = first->getNext();

@@ -3389,7 +3392,7 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
      startTimeStamp -= readjustTimeGPU_;
      // Destroy the TimeStamp object
      tsCache_->freeTimeStamp(ts);
-      first->setData(nullptr);
+      first->data().clear();
    } else {
      // For empty commands start/end is equal to
      // the end of the last valid command
@@ -194,23 +194,25 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
      head = ts->command().GetBatchHead();
    }
    while (head != nullptr) {
-      if (head->data() != nullptr) {
-        Timestamp* headTs  = reinterpret_cast<Timestamp*>(head->data());
-        ts->setParsedCommand(head);
-        for (auto it : headTs->Signals()) {
-          hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0;
-          if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) {
-            hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_,
-                                 HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
-                                 &HsaAmdSignalHandler, ts);
-            if (HSA_STATUS_SUCCESS != result) {
-              LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
-            } else {
-              ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p),"
-                      "handle(0x%lx)", static_cast<uint32_t>(val), headTs,
-                      headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
+      if (!head->data().empty()) {
+        for (auto i = 0; i < head->data().size(); i++) {
+          Timestamp* headTs  = reinterpret_cast<Timestamp*>(head->data()[i]);
+          ts->setParsedCommand(head);
+          for (auto it : headTs->Signals()) {
+            hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0;
+            if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) {
+              hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_,
+                                  HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
+                                  &HsaAmdSignalHandler, ts);
+              if (HSA_STATUS_SUCCESS != result) {
+                LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
+              } else {
+                ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p),"
+                        "handle(0x%lx)", static_cast<uint32_t>(val), headTs,
+                        headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
+              }
+              return false;
            }
-            return false;
          }
        }
      }
@@ -356,9 +358,7 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() {
 // ================================================================================================
 bool VirtualGPU::HwQueueTracker::Create() {
  uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
-  if (activity_prof::IsEnabled(OP_ID_DISPATCH) || gpu_.profiling_) {
-    kSignalListSize = !flagIsDefault(ROC_SIGNAL_POOL_SIZE) ? ROC_SIGNAL_POOL_SIZE : 4 * Ki;
-  }
+
  signal_list_.resize(kSignalListSize);

  hsa_agent_t agent = gpu_.gpu_device();
@@ -981,14 +981,14 @@ bool VirtualGPU::dispatchAqlPacket(
 inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) {
  amd::ScopedLock lock(execution());
  if (vcmd != nullptr) {
-    profilingBegin(*vcmd, true, true);
+    profilingBegin(*vcmd, true);
  }
  dispatchBlockingWait();
  constexpr size_t kPacketSize = 1;
  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
  if (vcmd != nullptr) {
-    profilingEnd(*vcmd, true);
+    profilingEnd(*vcmd);
  }
  return true;
 }
@@ -1455,23 +1455,18 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
 * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
 * and then calls start() to get the current host timestamp.
 */
-void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) {
+void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
  if (command.profilingInfo().enabled_) {
    if (timestamp_ != nullptr) {
      LogWarning("Trying to create a second timestamp in VirtualGPU. \
                  This could have unintended consequences.");
      return;
    }
-    Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;

-    if (ts == nullptr) {
-      // Without barrier profiling will wait for each individual signal
-      timestamp_ = new Timestamp(this, command);
-      command.setData(timestamp_);
-      timestamp_->start();
-    } else {
-      timestamp_ = ts;
-    }
+    // Without barrier profiling will wait for each individual signal
+    timestamp_ = new Timestamp(this, command);
+    command.data().emplace_back(timestamp_);
+    timestamp_->start();

    // Enable SDMA profiling on the first access if profiling is set
    // Its not per command basis
@@ -1504,11 +1499,10 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available.
 */
-void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) {
+void VirtualGPU::profilingEnd(amd::Command& command) {
  if (command.profilingInfo().enabled_) {
-    Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
-    if (ts->HwProfiling() == false) {
-      ts->end();
+    if (timestamp_->HwProfiling() == false) {
+      timestamp_->end();
    }
    timestamp_ = nullptr;
  }
@@ -1541,8 +1535,8 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
    // came before it to start and end with this first valid start time.
    current = list;
    while (current != nullptr) {
-      if (current->data() != nullptr) {
-        ts = reinterpret_cast<Timestamp*>(current->data());
+      if (!current->data().empty()) {
+        ts = reinterpret_cast<Timestamp*>(current->data().back());
        ts->getTime(&startTimeStamp, &endTimeStamp);
        break;
      }
@@ -1564,13 +1558,15 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
  current = list;
  while (current != nullptr) {
    if (current->profilingInfo().enabled_) {
-      if (current->data() != nullptr) {
-        // Since this is a valid command to get a timestamp, we use the
-        // timestamp provided by the runtime (saved in the data())
-        ts = reinterpret_cast<Timestamp*>(current->data());
-        ts->getTime(&startTimeStamp, &endTimeStamp);
-        ts->release();
-        current->setData(nullptr);
+      if (!current->data().empty()) {
+        for (auto i = 0; i < current->data().size(); i++) {
+          // Since this is a valid command to get a timestamp, we use the
+          // timestamp provided by the runtime (saved in the data())
+          ts = reinterpret_cast<Timestamp*>(current->data()[i]);
+          ts->getTime(&startTimeStamp, &endTimeStamp);
+          ts->release();
+        }
+        current->data().clear();
      } else {
        // If we don't have a command that contains a valid timestamp,
        // we simply use the end timestamp of the previous command.
@@ -3467,7 +3463,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
 void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
-  profilingBegin(vcmd, true, true);
+  profilingBegin(vcmd, true);

  uint8_t* aqlPacket = vcmd.getLastPacket();
  if (aqlPacket != nullptr) {
@@ -3489,7 +3485,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
    }
  }

-  profilingEnd(vcmd, true);
+  profilingEnd(vcmd);
 }

 // ================================================================================================
@@ -306,8 +306,8 @@ class VirtualGPU : public device::VirtualDevice {
  bool create();
  const Device& dev() const { return roc_device_; }

-  void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false);
-  void profilingEnd(amd::Command& command, bool useCommandTs = false);
+  void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
+  void profilingEnd(amd::Command& command);

  void updateCommandsState(amd::Command* list) const;

@@ -317,7 +317,6 @@ Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& ev
      queue_(&queue),
      next_(nullptr),
      type_(type),
-      data_(nullptr),
      waitingEvent_(waitingEvent),
      eventWaitList_(eventWaitList),
      commandWaitBits_(commandWaitBits) {
@@ -258,7 +258,7 @@ class Command : public Event {
  Command* next_;                  //!< Next GPU command in the queue list
  Command* batch_head_ = nullptr;  //!< The head of the batch commands
  cl_command_type type_;           //!< This command's OpenCL type.
-  void* data_;
+  std::vector<void*> data_;
  const Event* waitingEvent_;  //!< Waiting event associated with the marker

 protected:
@@ -282,7 +282,6 @@ class Command : public Event {
        queue_(nullptr),
        next_(nullptr),
        type_(type),
-        data_(nullptr),
        waitingEvent_(nullptr),
        eventWaitList_(nullWaitList),
        commandWaitBits_(0) {}
@@ -322,11 +321,9 @@ class Command : public Event {
  //! Return this command's OpenCL type.
  cl_command_type type() const { return type_; }

-  //! Return the opaque, device specific data for this command.
-  void* data() const { return data_; }
+  //! Return the opaque, device specific data vector for this command.
+  std::vector<void*>& data() { return data_; }

-  //! Set the opaque, device specific data for this command.
-  void setData(void* data) { data_ = data; }

  /*! \brief The execution engine for this command.
   *
@@ -1273,17 +1270,13 @@ class AccumulateCommand : public Command {

  //! Add kernel name to the list if available
  void addKernelName(const std::string& kernelName) {
-    if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
-      // "^" is to indicate kernel is captured at instantiate
-      kernelNames_.push_back("^  " + kernelName);
-    }
+    // "^" is to indicate kernel is captured at instantiate
+    kernelNames_.push_back("^  " + kernelName);
  }

  //! Add kernel timestamp to the list if available
  void addTimestamps(uint64_t startTs, uint64_t endTs) {
-    if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
-      tsList_.push_back(std::make_pair(startTs, endTs));
-    }
+    tsList_.push_back(std::make_pair(startTs, endTs));
  }

  //! Return the kernel names
@@ -51,7 +51,8 @@
 bool amd::ClGlEvent::waitForFence() {
  GLenum ret;
  // get fence id associated with fence event
-  GLsync gs = reinterpret_cast<GLsync>(command().data());
+  GLsync gs = !command().data().empty() ? reinterpret_cast<GLsync>(command().data().back())
+                                          : nullptr;
  if (!gs) return false;

 // Try to use DC and GLRC of current thread, if it doesn't exist
@@ -219,7 +219,7 @@ release(uint, ROC_P2P_SDMA_SIZE, 1024,                                        \
        "The minimum size in KB for P2P transfer with SDMA")                  \
 release(uint, ROC_AQL_QUEUE_SIZE, 16384,                                      \
        "AQL queue size in AQL packets")                                      \
-release(uint, ROC_SIGNAL_POOL_SIZE, 4096,                                     \
+release(uint, ROC_SIGNAL_POOL_SIZE, 64,                                       \
        "Initial size of HSA signal pool")                                    \
 release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16,                                    \
        "Limit the number of workgroups in blit operations")                  \