From cb9a715e04a404c0f1a42e8da52a148f5effabb0 Mon Sep 17 00:00:00 2001
From: Saleel Kudchadker <Saleel.Kudchadker@amd.com>
Date: Wed, 15 Nov 2023 04:27:56 +0000
Subject: [PATCH] SWDEV-422207 - Report kernel names for activity profiling

- Report kernel names for optimized graph path
- Refactor code so that we store profiling info in Accumulate command

Change-Id: Ib97735a0239aeb9fc3a50a4bb7126dd0bcadc8af


[ROCm/clr commit: b05668660777d6f530ff928b80153292ebff1075]
---
 .../clr/hipamd/src/hip_graph_internal.cpp     |  3 ++
 .../clr/hipamd/src/hip_graph_internal.hpp     | 11 ++++-
 .../clr/rocclr/device/rocm/rocvirtual.cpp     |  6 +--
 projects/clr/rocclr/platform/activity.cpp     |  8 +++-
 projects/clr/rocclr/platform/command.hpp      | 48 +++++++++++++------
 5 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp
index d4d07c4753..6886bc03c6 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.cpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.cpp
@@ -551,6 +551,7 @@ hipError_t GraphExec::Run(hipStream_t stream) {
     for (int i = 0; i < topoOrder_.size() - 1; i++) {
       if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
         hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
+        accumulate->addKernelName(topoOrder_[i]->GetKernelName());
       } else {
         topoOrder_[i]->SetStream(hip_stream, this);
         status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
@@ -561,6 +562,8 @@ hipError_t GraphExec::Run(hipStream_t stream) {
     // If last captured packet is kernel, optimize to detect completion of last kernel
     // This saves on extra packet submitted to determine end of graph
     if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) {
+      // Add the last kernel node name to the accumulate command
+      accumulate->addKernelName(topoOrder_.back()->GetKernelName());
       accumulate->enqueue();
       accumulate->release();
       isLastPacketKernel = true;
diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp
index 27892301c4..61bab5bb49 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.hpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.hpp
@@ -183,6 +183,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
   static amd::Monitor nodeSetLock_;
   unsigned int isEnabled_;
   uint8_t gpuPacket_[64];  //!< GPU Packet to enqueue during graph launch
+  std::string capturedKernelName_;
 
  public:
   GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "",
@@ -232,7 +233,10 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
   }
   // Return gpu packet address to update with actual packet under capture.
   uint8_t* GetAqlPacket() { return gpuPacket_; }
-  hip::Stream* GetQueue() { return stream_; }
+  void SetKernelName(std::string kernelName) { capturedKernelName_ = kernelName; }
+  const std::string& GetKernelName() const { return capturedKernelName_; }
+
+  hip::Stream* GetQueue() const { return stream_; }
 
   virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) {
     stream_ = stream;
@@ -805,8 +809,11 @@ class GraphKernelNode : public GraphNode {
         reinterpret_cast<amd::NDRangeKernelCommand*>(command)->setCapturingState(
             true, GetAqlPacket(), kernArgOffset);
 
-        // Enqueue command to capture GPU Packet. Packet is not sent to hardware queue.
+        // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
+        // The packet is stored in gpuPacket_ and submitted during graph launch.
         command->submit(*(command->queue())->vdev());
+        // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes
+        SetKernelName(reinterpret_cast<amd::NDRangeKernelCommand*>(command)->kernel().name());
         command->release();
       }
     }
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 652e195838..09674b96fd 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -135,7 +135,7 @@ void Timestamp::checkGpuTime() {
       // Avoid profiling data for the sync barrier, in tiny performance tests the first call
       // to ROCr is very slow and that also affects the overall performance of the callback thread
       if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_
-          || command().profilingInfo().multiple_ts_) {
+          || command().type() == CL_COMMAND_TASK) {
         hsa_amd_profiling_dispatch_time_t time = {};
         if (it->engine_ == HwQueueEngine::Compute) {
           hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
@@ -149,8 +149,8 @@ void Timestamp::checkGpuTime() {
         start = std::min(time.start, start);
         end = std::max(time.end, end);
 
-        if (command().profilingInfo().multiple_ts_) {
-          command().AddTimeStamps(time.start, time.end);
+        if (command().type() == CL_COMMAND_TASK) {
+          static_cast<amd::AccumulateCommand&>(command()).addTimestamps(time.start, time.end);
         }
 
         ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
diff --git a/projects/clr/rocclr/platform/activity.cpp b/projects/clr/rocclr/platform/activity.cpp
index bbe142202d..e90c5a7b08 100644
--- a/projects/clr/rocclr/platform/activity.cpp
+++ b/projects/clr/rocclr/platform/activity.cpp
@@ -101,10 +101,14 @@ void ReportActivity(const amd::Command& command) {
       break;
   }
 
-  if (command.profilingInfo().tsList_.size() > 0) {
-    for (auto& it : command.profilingInfo().tsList_) {
+  if (command.type() == CL_COMMAND_TASK) {
+    auto timestamps = static_cast<const amd::AccumulateCommand&>(command).getTimestamps();
+    for (uint32_t i = 0; i < timestamps.size(); i++) {
+      auto it = timestamps[i];
       record.begin_ns = it.first;
       record.end_ns = it.second;
+      record.kernel_name =
+        static_cast<const amd::AccumulateCommand&>(command).getKernelNames()[i].c_str();
       function(ACTIVITY_DOMAIN_HIP_OPS, operation_id, &record);
     }
   } else {
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index beb852ec9a..3fda3eee69 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -105,7 +105,7 @@ class Event : public RuntimeObject {
 
   struct ProfilingInfo {
     ProfilingInfo(bool enabled = false)
-      : enabled_(enabled), marker_ts_(false), multiple_ts_(false) {
+      : enabled_(enabled), marker_ts_(false) {
       if (enabled) {
         clear();
         correlation_id_ = activity_prof::correlation_id;
@@ -116,20 +116,17 @@ class Event : public RuntimeObject {
     uint64_t submitted_;
     uint64_t start_;
     uint64_t end_;
-    std::vector<std::pair<uint64_t, uint64_t>> tsList_;
 
     uint64_t correlation_id_;
     bool enabled_;        //!< Profiling enabled for the wave limiter
     bool marker_ts_;      //!< TS marker
-    bool multiple_ts_;    //!< Multiple TS
 
    void clear() {
       queued_ = 0ULL;
       submitted_ = 0ULL;
       start_ = 0ULL;
       end_ = 0ULL;
-      tsList_.clear();
-    }
+     }
   } profilingInfo_;
 
   //! Construct a new event.
@@ -225,11 +222,6 @@ class Event : public RuntimeObject {
 
   //! Set release scope for the event
   void setEventScope(int32_t scope) { event_scope_ = scope; }
-
-  //! Add a timestamp to the list
-  void AddTimeStamps(uint64_t start, uint64_t end) {
-    profilingInfo_.tsList_.push_back(std::make_pair(start, end));
-  }
 };
 
 union CopyMetadata {
@@ -1263,17 +1255,45 @@ class Marker : public Command {
 class AccumulateCommand : public Command {
  private:
   uint8_t* lastPacket_;
+
+  //! Kernel names and timestamps list for activity profiling
+  std::vector<std::string> kernelNames_;
+  std::vector<std::pair<uint64_t, uint64_t>> tsList_;
+
  public:
   //! Create a new Marker
   AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
          const Event* waitingEvent = nullptr, uint8_t* lastPacket = nullptr)
       : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent),
         lastPacket_(lastPacket)
-      {
-        profilingInfo_.multiple_ts_ = true;
-      }
-  // Return last packet
+      {}
+  //! Return last packet
   uint8_t* getLastPacket() const { return lastPacket_; }
+
+  //! Add kernel name to the list if available
+  void addKernelName(const std::string& kernelName) {
+    if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
+      kernelNames_.push_back(kernelName);
+    }
+  }
+
+  //! Add kernel timestamp to the list if available
+  void addTimestamps(uint64_t startTs, uint64_t endTs) {
+    if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
+      tsList_.push_back(std::make_pair(startTs, endTs));
+    }
+  }
+
+  //! Return the kernel names
+  const std::vector<std::string>& getKernelNames() const {
+    return kernelNames_;
+  }
+
+  //! Return the kernel timestamps
+  const std::vector<std::pair<uint64_t, uint64_t>>& getTimestamps() const {
+    return tsList_;
+  }
+
   //! The command implementation
   virtual void submit(device::VirtualDevice& device) {
     device.submitAccumulate(*this);