SWDEV-422207 - Report kernel names for activity profiling

- Report kernel names for optimized graph path - Refactor code so that we store profiling info in Accumulate command Change-Id: Ib97735a0239aeb9fc3a50a4bb7126dd0bcadc8af [ROCm/clr commit: b056686607]
2023-11-15 04:27:56 +00:00
@@ -551,6 +551,7 @@ hipError_t GraphExec::Run(hipStream_t stream) {
    for (int i = 0; i < topoOrder_.size() - 1; i++) {
      if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
        hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
+        accumulate->addKernelName(topoOrder_[i]->GetKernelName());
      } else {
        topoOrder_[i]->SetStream(hip_stream, this);
        status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
@@ -561,6 +562,8 @@ hipError_t GraphExec::Run(hipStream_t stream) {
    // If last captured packet is kernel, optimize to detect completion of last kernel
    // This saves on extra packet submitted to determine end of graph
    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) {
+      // Add the last kernel node name to the accumulate command
+      accumulate->addKernelName(topoOrder_.back()->GetKernelName());
      accumulate->enqueue();
      accumulate->release();
      isLastPacketKernel = true;
@@ -183,6 +183,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
  static amd::Monitor nodeSetLock_;
  unsigned int isEnabled_;
  uint8_t gpuPacket_[64];  //!< GPU Packet to enqueue during graph launch
+  std::string capturedKernelName_;

 public:
  GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "",
@@ -232,7 +233,10 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
  }
  // Return gpu packet address to update with actual packet under capture.
  uint8_t* GetAqlPacket() { return gpuPacket_; }
-  hip::Stream* GetQueue() { return stream_; }
+  void SetKernelName(std::string kernelName) { capturedKernelName_ = kernelName; }
+  const std::string& GetKernelName() const { return capturedKernelName_; }
+
+  hip::Stream* GetQueue() const { return stream_; }

  virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) {
    stream_ = stream;
@@ -805,8 +809,11 @@ class GraphKernelNode : public GraphNode {
        reinterpret_cast<amd::NDRangeKernelCommand*>(command)->setCapturingState(
            true, GetAqlPacket(), kernArgOffset);

-        // Enqueue command to capture GPU Packet. Packet is not sent to hardware queue.
+        // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
+        // The packet is stored in gpuPacket_ and submitted during graph launch.
        command->submit(*(command->queue())->vdev());
+        // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes
+        SetKernelName(reinterpret_cast<amd::NDRangeKernelCommand*>(command)->kernel().name());
        command->release();
      }
    }
@@ -135,7 +135,7 @@ void Timestamp::checkGpuTime() {
      // Avoid profiling data for the sync barrier, in tiny performance tests the first call
      // to ROCr is very slow and that also affects the overall performance of the callback thread
      if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_
-          || command().profilingInfo().multiple_ts_) {
+          || command().type() == CL_COMMAND_TASK) {
        hsa_amd_profiling_dispatch_time_t time = {};
        if (it->engine_ == HwQueueEngine::Compute) {
          hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
@@ -149,8 +149,8 @@ void Timestamp::checkGpuTime() {
        start = std::min(time.start, start);
        end = std::max(time.end, end);

-        if (command().profilingInfo().multiple_ts_) {
-          command().AddTimeStamps(time.start, time.end);
+        if (command().type() == CL_COMMAND_TASK) {
+          static_cast<amd::AccumulateCommand&>(command()).addTimestamps(time.start, time.end);
        }

        ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
@@ -101,10 +101,14 @@ void ReportActivity(const amd::Command& command) {
      break;
  }

-  if (command.profilingInfo().tsList_.size() > 0) {
-    for (auto& it : command.profilingInfo().tsList_) {
+  if (command.type() == CL_COMMAND_TASK) {
+    auto timestamps = static_cast<const amd::AccumulateCommand&>(command).getTimestamps();
+    for (uint32_t i = 0; i < timestamps.size(); i++) {
+      auto it = timestamps[i];
      record.begin_ns = it.first;
      record.end_ns = it.second;
+      record.kernel_name =
+        static_cast<const amd::AccumulateCommand&>(command).getKernelNames()[i].c_str();
      function(ACTIVITY_DOMAIN_HIP_OPS, operation_id, &record);
    }
  } else {
@@ -105,7 +105,7 @@ class Event : public RuntimeObject {

  struct ProfilingInfo {
    ProfilingInfo(bool enabled = false)
-      : enabled_(enabled), marker_ts_(false), multiple_ts_(false) {
+      : enabled_(enabled), marker_ts_(false) {
      if (enabled) {
        clear();
        correlation_id_ = activity_prof::correlation_id;
@@ -116,20 +116,17 @@ class Event : public RuntimeObject {
    uint64_t submitted_;
    uint64_t start_;
    uint64_t end_;
-    std::vector<std::pair<uint64_t, uint64_t>> tsList_;

    uint64_t correlation_id_;
    bool enabled_;        //!< Profiling enabled for the wave limiter
    bool marker_ts_;      //!< TS marker
-    bool multiple_ts_;    //!< Multiple TS

   void clear() {
      queued_ = 0ULL;
      submitted_ = 0ULL;
      start_ = 0ULL;
      end_ = 0ULL;
-      tsList_.clear();
-    }
+     }
  } profilingInfo_;

  //! Construct a new event.
@@ -225,11 +222,6 @@ class Event : public RuntimeObject {

  //! Set release scope for the event
  void setEventScope(int32_t scope) { event_scope_ = scope; }
-
-  //! Add a timestamp to the list
-  void AddTimeStamps(uint64_t start, uint64_t end) {
-    profilingInfo_.tsList_.push_back(std::make_pair(start, end));
-  }
 };

 union CopyMetadata {
@@ -1263,17 +1255,45 @@ class Marker : public Command {
 class AccumulateCommand : public Command {
 private:
  uint8_t* lastPacket_;
+
+  //! Kernel names and timestamps list for activity profiling
+  std::vector<std::string> kernelNames_;
+  std::vector<std::pair<uint64_t, uint64_t>> tsList_;
+
 public:
  //! Create a new Marker
  AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
         const Event* waitingEvent = nullptr, uint8_t* lastPacket = nullptr)
      : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent),
        lastPacket_(lastPacket)
-      {
-        profilingInfo_.multiple_ts_ = true;
-      }
-  // Return last packet
+      {}
+  //! Return last packet
  uint8_t* getLastPacket() const { return lastPacket_; }
+
+  //! Add kernel name to the list if available
+  void addKernelName(const std::string& kernelName) {
+    if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
+      kernelNames_.push_back(kernelName);
+    }
+  }
+
+  //! Add kernel timestamp to the list if available
+  void addTimestamps(uint64_t startTs, uint64_t endTs) {
+    if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
+      tsList_.push_back(std::make_pair(startTs, endTs));
+    }
+  }
+
+  //! Return the kernel names
+  const std::vector<std::string>& getKernelNames() const {
+    return kernelNames_;
+  }
+
+  //! Return the kernel timestamps
+  const std::vector<std::pair<uint64_t, uint64_t>>& getTimestamps() const {
+    return tsList_;
+  }
+
  //! The command implementation
  virtual void submit(device::VirtualDevice& device) {
    device.submitAccumulate(*this);