From 346da4bb4026eb3db9b6e23ce6f237293f8d977d Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya <anusha.godavarthysurya@amd.com>
Date: Tue, 18 Jun 2024 11:12:37 +0000
Subject: [PATCH] SWDEV-468424 - hipgraph capture memset node

Capture AQL packets during GraphInstantiation and enqueue AQL packets during graph launch.

Added support for capturing a single graph memset node.
Capture support for memset nodes is currently disabled.
Memset capture will be enabled once capture of multiple packets is supported.

Change-Id: I14dfbc41731025cc3a548a730558915def3fa384
---
 hipamd/src/hip_graph_internal.cpp | 19 +++++------
 hipamd/src/hip_graph_internal.hpp | 53 +++++++++++++++++--------------
 rocclr/device/rocm/rocvirtual.cpp | 19 +++++++----
 rocclr/device/rocm/rocvirtual.hpp |  2 ++
 rocclr/platform/command.hpp       | 49 ++++++++++++++++------------
 5 files changed, 83 insertions(+), 59 deletions(-)
diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp
index bf1ee627f9..3d44b10b67 100644
--- a/hipamd/src/hip_graph_internal.cpp
+++ b/hipamd/src/hip_graph_internal.cpp
@@ -362,9 +362,9 @@ void GetKernelArgSizeForGraph(std::vector<std::vector<Node>>& parallelLists,
   // GPU packet capture is enabled for kernel nodes. Calculate the kernel
   // arg size required for all graph kernel nodes to allocate
   for (const auto& list : parallelLists) {
-    for (auto& node : list) {
-      if (node->GetType() == hipGraphNodeTypeKernel) {
-        kernArgSizeForGraph += reinterpret_cast<hip::GraphKernelNode*>(node)->GetKerArgSize();
+    for (hip::GraphNode* node : list) {
+      if (node->GraphCaptureEnabled()) {
+        kernArgSizeForGraph += node->GetKerArgSize();
       } else if (node->GetType() == hipGraphNodeTypeGraph) {
         auto& childParallelLists =
             reinterpret_cast<hip::ChildGraphNode*>(node)->GetParallelLists();
@@ -388,18 +388,19 @@ hipError_t AllocKernelArgForGraphNode(std::vector<hip::Node>& topoOrder,
         graphExec->SetHiddenHeap();
         initialized = true;
       }
-      auto kernelNode = reinterpret_cast<hip::GraphKernelNode*>(node);
-      // From the kernel pool allocate the kern arg size required for the current kernel node.
+    }
+    if (node->GraphCaptureEnabled()) {
+      // From the kernel pool allocate the kern arg size required for the current node.
       address kernArgOffset = nullptr;
-      if (kernelNode->GetKernargSegmentByteSize()) {
+      if (node->GetKernargSegmentByteSize()) {
         kernArgOffset = graphExec->kernArgManager_->AllocKernArg(
-            kernelNode->GetKernargSegmentByteSize(), kernelNode->GetKernargSegmentAlignment());
+            node->GetKernargSegmentByteSize(), node->GetKernargSegmentAlignment());
         if (kernArgOffset == nullptr) {
           return hipErrorMemoryAllocation;
         }
       }
       // Form GPU packet capture for the kernel node.
-      kernelNode->CaptureAndFormPacket(capture_stream, kernArgOffset);
+      node->CaptureAndFormPacket(capture_stream, kernArgOffset);
     } else if (node->GetType() == hipGraphNodeTypeGraph) {
       auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
       auto& childParallelLists = childNode->GetParallelLists();
@@ -551,7 +552,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::St
     accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
   }
   for (int i = 0; i < topoOrder.size(); i++) {
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) {
+    if (topoOrder[i]->GraphCaptureEnabled()) {
       if (topoOrder[i]->GetEnabled()) {
         hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(),
                                               topoOrder[i]->GetKernelName(),
diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp
index 20499a202b..171bfb24da 100644
--- a/hipamd/src/hip_graph_internal.hpp
+++ b/hipamd/src/hip_graph_internal.hpp
@@ -186,6 +186,9 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
   unsigned int isEnabled_;
   uint8_t gpuPacket_[64];  //!< GPU Packet to enqueue during graph launch
   std::string capturedKernelName_;
+  size_t alignedKernArgSize_ = 256;       //!< Aligned size required for kernel args
+  size_t kernargSegmentByteSize_ = 256;   //!< Kernel arg segment byte size
+  size_t kernargSegmentAlignment_ = 256;  //!< Kernel arg segment alignment
 
  public:
   GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "",
@@ -237,7 +240,19 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
   uint8_t* GetAqlPacket() { return gpuPacket_; }
   void SetKernelName(const std::string& kernelName) { capturedKernelName_ = kernelName; }
   const std::string& GetKernelName() const { return capturedKernelName_; }
-
+  size_t GetKerArgSize() const { return alignedKernArgSize_; }
+  size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
+  size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
+  void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) {
+    hipError_t status = CreateCommand(capture_stream);
+    for (auto& command : commands_) {
+      command->setCapturingState(true, GetAqlPacket(), kernArgOffset, &capturedKernelName_);
+      // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
+      // The packet is stored in gpuPacket_ and submitted during graph launch.
+      command->submit(*(command->queue())->vdev());
+      command->release();
+    }
+  }
   hip::Stream* GetQueue() const { return stream_; }
 
   virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) {
@@ -380,6 +395,20 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
   }
   unsigned int GetEnabled() const { return isEnabled_; }
   void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; }
+  // Returns true if capture is enabled for the current node.
+  bool GraphCaptureEnabled() {
+    bool isGraphCapture = false;
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+      switch (GetType()) {
+        case hipGraphNodeTypeKernel:
+          isGraphCapture = true;
+          break;
+        default:
+          break;
+      }
+    }
+    return isGraphCapture;
+  }
 };
 
 struct Graph {
@@ -835,16 +864,9 @@ class GraphKernelNode : public GraphNode {
   hipKernelNodeAttrValue kernelAttr_;  //!< Kernel node attributes
   unsigned int kernelAttrInUse_;       //!< Kernel attributes in use
   ihipExtKernelEvents kernelEvents_;   //!< Events for Ext launch kernel
-  size_t alignedKernArgSize_;          //!< Aligned size required for kernel args
-  size_t kernargSegmentByteSize_;      //!< Kernel arg segment byte size
-  size_t kernargSegmentAlignment_;     //!< Kernel arg segment alignment
   bool hasHiddenHeap_;                 //!< Kernel has hidden heap(device side allocation)
 
-
  public:
-  size_t GetKerArgSize() const { return alignedKernArgSize_; }
-  size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
-  size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
   bool HasHiddenHeap() const { return hasHiddenHeap_; }
   void EnqueueCommands(hipStream_t stream) override {
     // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
@@ -888,21 +910,6 @@ class GraphKernelNode : public GraphNode {
     out << "];";
     }
 
-  void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) {
-    hipError_t status = CreateCommand(capture_stream);
-    for (auto& command : commands_) {
-      reinterpret_cast<amd::NDRangeKernelCommand*>(command)->setCapturingState(
-          true, GetAqlPacket(), kernArgOffset);
-
-      // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
-      // The packet is stored in gpuPacket_ and submitted during graph launch.
-      command->submit(*(command->queue())->vdev());
-      // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes
-      SetKernelName(reinterpret_cast<amd::NDRangeKernelCommand*>(command)->kernel().name());
-      command->release();
-    }
-  }
-
   virtual std::string GetLabel(hipGraphDebugDotFlags flag) override {
     hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
     hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index fa822ac3c1..3b10056c8a 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -1497,6 +1497,9 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
       }
     }
   }
+  if (command.getCapturingState()) {
+    currCmd_ = &command;
+  }
 }
 
 // ================================================================================================
@@ -1514,6 +1517,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
   if (AMD_DIRECT_DISPATCH) {
     assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty());
   }
+  currCmd_ = nullptr;
 }
 
 // ================================================================================================
@@ -3018,7 +3022,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
 
   amd::Memory* const* memories =
       reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
-  bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
+  bool isGraphCapture = currCmd_ != nullptr && currCmd_->getCapturingState();
   for (int j = 0; j < iteration; j++) {
     // Reset global size for dimension dim if split is needed
     if (dim != -1) {
@@ -3238,7 +3242,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
       // Allocate buffer to hold kernel arguments
       if (isGraphCapture) {
-        argBuffer = vcmd->getKernArgOffset();
+        argBuffer = currCmd_->getKernArgOffset();
+        currCmd_->SetKernelName(gpuKernel.name());
       } else {
 
         argBuffer = reinterpret_cast<address>(
@@ -3324,18 +3329,18 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
     }
 
-    if (vcmd == nullptr) {
+    if (isGraphCapture) {
       // Dispatch the packet
       if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
-                             (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION)) {
+                        (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
+                        GPU_FLUSH_ON_EXECUTION, currCmd_->getCapturingState(),
+                        currCmd_->getAqlPacket())) {
         return false;
       }
     } else {
       if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                              (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION, vcmd->getCapturingState(),
-                             vcmd->getAqlPacket())) {
+                             GPU_FLUSH_ON_EXECUTION)) {
         return false;
       }
     }
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 053c9751b7..8ae66fde31 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -575,5 +575,7 @@ class VirtualGPU : public device::VirtualDevice {
   std::atomic<uint> lastUsedSdmaEngineMask_;     //!< Last Used SDMA Engine mask
 
   using KernelArgImpl = device::Settings::KernelArgImpl;
+
+  amd::Command* currCmd_ = nullptr;  //!< Current command under capture
 };
 }
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index fa8c315b57..af70bb28ab 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -256,6 +256,10 @@ class Command : public Event {
   std::vector<void*> data_;
   const Event* waitingEvent_;  //!< Waiting event associated with the marker
 
+  bool capturing_ = false;           //!< Flag to enable/disable graph gpu packet capture
+  uint8_t* gpuPacket_ = nullptr;     //!< GPU packet to capture, when graph capturing is enabled
+  address kernArgOffset_ = nullptr;  //!< KernelArg buffer to use when graph capturing is enabled
+  std::string* capturedKernelName_ = nullptr;  //!< Destination for the captured kernel name
  protected:
   bool cpu_wait_ = false;         //!< If true, then the command was issued for CPU/GPU sync
 
@@ -292,6 +296,31 @@ class Command : public Event {
   }
 
  public:
+  //! Returns true when graph GPU packet capture is enabled for this command
+  bool getCapturingState() const { return capturing_; }
+
+  //! Sets AQL capture state, aql packet to capture, where to copy kernArgs and the kernel name
+  void setCapturingState(bool state, uint8_t* packet, address kernArgOffset,
+                         std::string* capturedKernelName) {
+    capturing_ = state;
+    gpuPacket_ = packet;
+    kernArgOffset_ = kernArgOffset;
+    capturedKernelName_ = capturedKernelName;
+  }
+
+  //! Updates kernel name with the captured kernel name
+  void SetKernelName(const std::string& kernelName) {
+    if (capturedKernelName_ != nullptr) {
+      *capturedKernelName_ = kernelName;
+    }
+  }
+
+  //! Returns the GPU packet buffer that was set through setCapturingState()
+  const uint8_t* getAqlPacket() const { return gpuPacket_; }
+
+  //! Returns the kernel argument buffer that was set through setCapturingState()
+  const address getKernArgOffset() const { return kernArgOffset_; }
+
   //! Overload new/delete for fast commands allocation/destruction
   void* operator new(size_t size);
   void operator delete(void* ptr);
@@ -1075,10 +1104,6 @@ class NDRangeKernelCommand : public Command {
   uint32_t firstDevice_;    //!< Device index of the first device in the gridc
   uint32_t numWorkgroups_;  //!< Total number of workgroups in the current launch
 
-  bool capturing_ = false;           //!< Flag to enable/disable graph gpu packet capture
-  uint8_t* gpuPacket_ = nullptr;     //!< GPU packet to capture, when graph capturing is enabled
-  address kernArgOffset_ = nullptr;  //!< KernelArg buffer to used when graph capturing is enabled
-
  public:
   enum {
     CooperativeGroups = 0x01,
@@ -1086,22 +1111,6 @@ class NDRangeKernelCommand : public Command {
     AnyOrderLaunch = 0x04,
   };
 
-  //! Returns AQL buffer state
-  bool getCapturingState() const { return capturing_; }
-
-  //! Sets AQL capture state, aql packet to capture and where to copy kernArgs
-  void setCapturingState(bool state, uint8_t* packet, address kernArgOffset) {
-    capturing_ = state;
-    gpuPacket_ = packet;
-    kernArgOffset_ = kernArgOffset;
-  }
-
-  //! returns the graph executable object command belongs to.
-  const uint8_t* getAqlPacket() const { return gpuPacket_; }
-
-  //! returns the graph executable object command belongs to.
-  const address getKernArgOffset() const { return kernArgOffset_; }
-
   //! Construct an ExecuteKernel command
   NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
                        const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,