From f3aedfbec048cd0f9df035eaf88b40a728cb3c77 Mon Sep 17 00:00:00 2001
From: Saleel Kudchadker <Saleel.Kudchadker@amd.com>
Date: Thu, 21 Mar 2024 18:26:01 +0000
Subject: [PATCH] SWDEV-301667 - Create TS for each node recorded in graph

- Create a vector to allow multiple TS to be stored in Command.
- This would mean we dont wait for entire batch in Accumulate command
to finish when we exhaust signals.
- Reduce the number of signals created at init to 64. This min value
may still need to be tuned but the KFD allows max of 4094 interrupt
signals per device.
- Store kernel names whenever they are available and not just when
profiling. If we dynamically enable profiling like for Torch, a crash
can happen if hipGraphInstantiate wasnt included in Torch profile scope
beacuse we previously entered kernel names only when profiler is
attached.

Change-Id: I34e7881a25bbc763f82fdeb3408a8ea58e1ec006


[ROCm/clr commit: c157bfb2022076959c521269b27f34996c1ee730]
---
 projects/clr/opencl/amdocl/cl_gl.cpp          |  2 +-
 projects/clr/rocclr/device/pal/palvirtual.cpp | 15 ++--
 .../clr/rocclr/device/rocm/rocvirtual.cpp     | 88 +++++++++----------
 .../clr/rocclr/device/rocm/rocvirtual.hpp     |  4 +-
 projects/clr/rocclr/platform/command.cpp      |  1 -
 projects/clr/rocclr/platform/command.hpp      | 19 ++--
 projects/clr/rocclr/platform/interop_gl.cpp   |  3 +-
 projects/clr/rocclr/utils/flags.hpp           |  2 +-
 8 files changed, 63 insertions(+), 71 deletions(-)

diff --git a/projects/clr/opencl/amdocl/cl_gl.cpp b/projects/clr/opencl/amdocl/cl_gl.cpp
index a6a2851d11..7ccfb0c00b 100644
--- a/projects/clr/opencl/amdocl/cl_gl.cpp
+++ b/projects/clr/opencl/amdocl/cl_gl.cpp
@@ -769,7 +769,7 @@ RUNTIME_ENTRY_RET(cl_event, clCreateEventFromGLsyncKHR,
   // initially set the status of fence as queued
   clglEvent->setStatus(CL_SUBMITTED);
   // store GLsync id of the fence in event in order to associate them together
-  clglEvent->setData(clGLsync);
+  clglEvent->data().emplace_back(clGLsync);
   amd::Event* evt = clglEvent;
   evt->retain();
   *not_null(errcode_ret) = CL_SUCCESS;
diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp
index da61f82314..388b68a028 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.cpp
@@ -3317,7 +3317,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
       return;
     }
     // Save the TimeStamp object in the current OCL event
-    command.setData(ts);
+    command.data().emplace_back(ts);
     profileTs_ = ts;
     state_.profileEnabled_ = true;
   }
@@ -3325,7 +3325,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
 
 void VirtualGPU::profilingEnd(amd::Command& command) {
   // Get the TimeStamp object associated witht the current command
-  TimeStamp* ts = reinterpret_cast<TimeStamp*>(command.data());
+  TimeStamp* ts = !command.data().empty() ? reinterpret_cast<TimeStamp*>(command.data().back())
+                                            : nullptr;
   if (ts != nullptr) {
     // Check if the command actually did any GPU submission
     if (ts->isValid()) {
@@ -3333,7 +3334,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
     } else {
       // Destroy the TimeStamp object
       tsCache_->freeTimeStamp(ts);
-      command.setData(nullptr);
+      command.data().clear();
     }
   }
 }
@@ -3362,7 +3363,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
   first = cb->head_;
   while (nullptr != first) {
     // Get the TimeStamp object associated witht the current command
-    TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
+    TimeStamp* ts = !first->data().empty() ? reinterpret_cast<TimeStamp*>(first->data().back())
+                                            : nullptr;
 
     if (ts != nullptr) {
       ts->value(&startTimeStamp, &endTimeStamp);
@@ -3379,7 +3381,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
   first = cb->head_;
   while (nullptr != first) {
     // Get the TimeStamp object associated witht the current command
-    TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
+    TimeStamp* ts = !first->data().empty() ? reinterpret_cast<TimeStamp*>(first->data().back())
+                                            : nullptr;
 
     current = first->getNext();
 
@@ -3389,7 +3392,7 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai
       startTimeStamp -= readjustTimeGPU_;
       // Destroy the TimeStamp object
       tsCache_->freeTimeStamp(ts);
-      first->setData(nullptr);
+      first->data().clear();
     } else {
       // For empty commands start/end is equal to
       // the end of the last valid command
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 1610aecfe1..54c6da78d2 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -194,23 +194,25 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
       head = ts->command().GetBatchHead();
     }
     while (head != nullptr) {
-      if (head->data() != nullptr) {
-        Timestamp* headTs  = reinterpret_cast<Timestamp*>(head->data());
-        ts->setParsedCommand(head);
-        for (auto it : headTs->Signals()) {
-          hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0;
-          if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) {
-            hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_,
-                                 HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
-                                 &HsaAmdSignalHandler, ts);
-            if (HSA_STATUS_SUCCESS != result) {
-              LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
-            } else {
-              ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p),"
-                      "handle(0x%lx)", static_cast<uint32_t>(val), headTs,
-                      headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
+      if (!head->data().empty()) {
+        for (auto i = 0; i < head->data().size(); i++) {
+          Timestamp* headTs  = reinterpret_cast<Timestamp*>(head->data()[i]);
+          ts->setParsedCommand(head);
+          for (auto it : headTs->Signals()) {
+            hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0;
+            if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) {
+              hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_,
+                                  HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
+                                  &HsaAmdSignalHandler, ts);
+              if (HSA_STATUS_SUCCESS != result) {
+                LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
+              } else {
+                ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p),"
+                        "handle(0x%lx)", static_cast<uint32_t>(val), headTs,
+                        headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
+              }
+              return false;
             }
-            return false;
           }
         }
       }
@@ -356,9 +358,7 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() {
 // ================================================================================================
 bool VirtualGPU::HwQueueTracker::Create() {
   uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
-  if (activity_prof::IsEnabled(OP_ID_DISPATCH) || gpu_.profiling_) {
-    kSignalListSize = !flagIsDefault(ROC_SIGNAL_POOL_SIZE) ? ROC_SIGNAL_POOL_SIZE : 4 * Ki;
-  }
+
   signal_list_.resize(kSignalListSize);
 
   hsa_agent_t agent = gpu_.gpu_device();
@@ -981,14 +981,14 @@ bool VirtualGPU::dispatchAqlPacket(
 inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) {
   amd::ScopedLock lock(execution());
   if (vcmd != nullptr) {
-    profilingBegin(*vcmd, true, true);
+    profilingBegin(*vcmd, true);
   }
   dispatchBlockingWait();
   constexpr size_t kPacketSize = 1;
   auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
   dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
   if (vcmd != nullptr) {
-    profilingEnd(*vcmd, true);
+    profilingEnd(*vcmd);
   }
   return true;
 }
@@ -1455,23 +1455,18 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
 * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
 * and then calls start() to get the current host timestamp.
 */
-void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) {
+void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
   if (command.profilingInfo().enabled_) {
     if (timestamp_ != nullptr) {
       LogWarning("Trying to create a second timestamp in VirtualGPU. \
                   This could have unintended consequences.");
       return;
     }
-    Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
 
-    if (ts == nullptr) {
-      // Without barrier profiling will wait for each individual signal
-      timestamp_ = new Timestamp(this, command);
-      command.setData(timestamp_);
-      timestamp_->start();
-    } else {
-      timestamp_ = ts;
-    }
+    // Without barrier profiling will wait for each individual signal
+    timestamp_ = new Timestamp(this, command);
+    command.data().emplace_back(timestamp_);
+    timestamp_->start();
 
     // Enable SDMA profiling on the first access if profiling is set
     // Its not per command basis
@@ -1504,11 +1499,10 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available.
 */
-void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) {
+void VirtualGPU::profilingEnd(amd::Command& command) {
   if (command.profilingInfo().enabled_) {
-    Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
-    if (ts->HwProfiling() == false) {
-      ts->end();
+    if (timestamp_->HwProfiling() == false) {
+      timestamp_->end();
     }
     timestamp_ = nullptr;
   }
@@ -1541,8 +1535,8 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
     // came before it to start and end with this first valid start time.
     current = list;
     while (current != nullptr) {
-      if (current->data() != nullptr) {
-        ts = reinterpret_cast<Timestamp*>(current->data());
+      if (!current->data().empty()) {
+        ts = reinterpret_cast<Timestamp*>(current->data().back());
         ts->getTime(&startTimeStamp, &endTimeStamp);
         break;
       }
@@ -1564,13 +1558,15 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
   current = list;
   while (current != nullptr) {
     if (current->profilingInfo().enabled_) {
-      if (current->data() != nullptr) {
-        // Since this is a valid command to get a timestamp, we use the
-        // timestamp provided by the runtime (saved in the data())
-        ts = reinterpret_cast<Timestamp*>(current->data());
-        ts->getTime(&startTimeStamp, &endTimeStamp);
-        ts->release();
-        current->setData(nullptr);
+      if (!current->data().empty()) {
+        for (auto i = 0; i < current->data().size(); i++) {
+          // Since this is a valid command to get a timestamp, we use the
+          // timestamp provided by the runtime (saved in the data())
+          ts = reinterpret_cast<Timestamp*>(current->data()[i]);
+          ts->getTime(&startTimeStamp, &endTimeStamp);
+          ts->release();
+        }
+        current->data().clear();
       } else {
         // If we don't have a command that contains a valid timestamp,
         // we simply use the end timestamp of the previous command.
@@ -3467,7 +3463,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
 void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
-  profilingBegin(vcmd, true, true);
+  profilingBegin(vcmd, true);
 
   uint8_t* aqlPacket = vcmd.getLastPacket();
   if (aqlPacket != nullptr) {
@@ -3489,7 +3485,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
     }
   }
 
-  profilingEnd(vcmd, true);
+  profilingEnd(vcmd);
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 25a5715642..c5f8fa4df2 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -306,8 +306,8 @@ class VirtualGPU : public device::VirtualDevice {
   bool create();
   const Device& dev() const { return roc_device_; }
 
-  void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false);
-  void profilingEnd(amd::Command& command, bool useCommandTs = false);
+  void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
+  void profilingEnd(amd::Command& command);
 
   void updateCommandsState(amd::Command* list) const;
 
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index 7e078c9a68..aec37d179b 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -317,7 +317,6 @@ Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& ev
       queue_(&queue),
       next_(nullptr),
       type_(type),
-      data_(nullptr),
       waitingEvent_(waitingEvent),
       eventWaitList_(eventWaitList),
       commandWaitBits_(commandWaitBits) {
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index cbd837a2d6..d60c95db66 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -258,7 +258,7 @@ class Command : public Event {
   Command* next_;                  //!< Next GPU command in the queue list
   Command* batch_head_ = nullptr;  //!< The head of the batch commands
   cl_command_type type_;           //!< This command's OpenCL type.
-  void* data_;
+  std::vector<void*> data_;
   const Event* waitingEvent_;  //!< Waiting event associated with the marker
 
  protected:
@@ -282,7 +282,6 @@ class Command : public Event {
         queue_(nullptr),
         next_(nullptr),
         type_(type),
-        data_(nullptr),
         waitingEvent_(nullptr),
         eventWaitList_(nullWaitList),
         commandWaitBits_(0) {}
@@ -322,11 +321,9 @@ class Command : public Event {
   //! Return this command's OpenCL type.
   cl_command_type type() const { return type_; }
 
-  //! Return the opaque, device specific data for this command.
-  void* data() const { return data_; }
+  //! Return the opaque, device specific data vector for this command.
+  std::vector<void*>& data() { return data_; }
 
-  //! Set the opaque, device specific data for this command.
-  void setData(void* data) { data_ = data; }
 
   /*! \brief The execution engine for this command.
    *
@@ -1273,17 +1270,13 @@ class AccumulateCommand : public Command {
 
   //! Add kernel name to the list if available
   void addKernelName(const std::string& kernelName) {
-    if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
-      // "^" is to indicate kernel is captured at instantiate
-      kernelNames_.push_back("^  " + kernelName);
-    }
+    // "^" is to indicate kernel is captured at instantiate
+    kernelNames_.push_back("^  " + kernelName);
   }
 
   //! Add kernel timestamp to the list if available
   void addTimestamps(uint64_t startTs, uint64_t endTs) {
-    if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
-      tsList_.push_back(std::make_pair(startTs, endTs));
-    }
+    tsList_.push_back(std::make_pair(startTs, endTs));
   }
 
   //! Return the kernel names
diff --git a/projects/clr/rocclr/platform/interop_gl.cpp b/projects/clr/rocclr/platform/interop_gl.cpp
index 342b034e69..20cf3a7d05 100644
--- a/projects/clr/rocclr/platform/interop_gl.cpp
+++ b/projects/clr/rocclr/platform/interop_gl.cpp
@@ -51,7 +51,8 @@
 bool amd::ClGlEvent::waitForFence() {
   GLenum ret;
   // get fence id associated with fence event
-  GLsync gs = reinterpret_cast<GLsync>(command().data());
+  GLsync gs = !command().data().empty() ? reinterpret_cast<GLsync>(command().data().back())
+                                          : nullptr;
   if (!gs) return false;
 
 // Try to use DC and GLRC of current thread, if it doesn't exist
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 72f41dc693..0230eddb25 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -219,7 +219,7 @@ release(uint, ROC_P2P_SDMA_SIZE, 1024,                                        \
         "The minimum size in KB for P2P transfer with SDMA")                  \
 release(uint, ROC_AQL_QUEUE_SIZE, 16384,                                      \
         "AQL queue size in AQL packets")                                      \
-release(uint, ROC_SIGNAL_POOL_SIZE, 4096,                                     \
+release(uint, ROC_SIGNAL_POOL_SIZE, 64,                                       \
         "Initial size of HSA signal pool")                                    \
 release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16,                                    \
         "Limit the number of workgroups in blit operations")                  \