SWDEV-468424 - hipgraph capture memset node

Capture AQL packets during graph instantiation and enqueue the captured packets during graph launch. Added support for capturing a single graph memset node. Capture support for memset nodes is currently disabled; it will be enabled once capture of multiple packets is supported. Change-Id: I14dfbc41731025cc3a548a730558915def3fa384 [ROCm/clr commit: 346da4bb40]
2024-06-18 11:12:37 +00:00
@@ -362,9 +362,9 @@ void GetKernelArgSizeForGraph(std::vector<std::vector<Node>>& parallelLists,
  // GPU packet capture is enabled for kernel nodes. Calculate the kernel
  // arg size required for all graph kernel nodes to allocate
  for (const auto& list : parallelLists) {
-    for (auto& node : list) {
-      if (node->GetType() == hipGraphNodeTypeKernel) {
-        kernArgSizeForGraph += reinterpret_cast<hip::GraphKernelNode*>(node)->GetKerArgSize();
+    for (hip::GraphNode* node : list) {
+      if (node->GraphCaptureEnabled()) {
+        kernArgSizeForGraph += node->GetKerArgSize();
      } else if (node->GetType() == hipGraphNodeTypeGraph) {
        auto& childParallelLists =
            reinterpret_cast<hip::ChildGraphNode*>(node)->GetParallelLists();
@@ -388,18 +388,19 @@ hipError_t AllocKernelArgForGraphNode(std::vector<hip::Node>& topoOrder,
        graphExec->SetHiddenHeap();
        initialized = true;
      }
-      auto kernelNode = reinterpret_cast<hip::GraphKernelNode*>(node);
-      // From the kernel pool allocate the kern arg size required for the current kernel node.
+    }
+    if (node->GraphCaptureEnabled()) {
+      // From the kernel pool allocate the kern arg size required for the current node.
      address kernArgOffset = nullptr;
-      if (kernelNode->GetKernargSegmentByteSize()) {
+      if (node->GetKernargSegmentByteSize()) {
        kernArgOffset = graphExec->kernArgManager_->AllocKernArg(
-            kernelNode->GetKernargSegmentByteSize(), kernelNode->GetKernargSegmentAlignment());
+            node->GetKernargSegmentByteSize(), node->GetKernargSegmentAlignment());
        if (kernArgOffset == nullptr) {
          return hipErrorMemoryAllocation;
        }
      }
      // Form GPU packet capture for the kernel node.
-      kernelNode->CaptureAndFormPacket(capture_stream, kernArgOffset);
+      node->CaptureAndFormPacket(capture_stream, kernArgOffset);
    } else if (node->GetType() == hipGraphNodeTypeGraph) {
      auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
      auto& childParallelLists = childNode->GetParallelLists();
@@ -551,7 +552,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::St
    accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
  }
  for (int i = 0; i < topoOrder.size(); i++) {
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) {
+    if (topoOrder[i]->GraphCaptureEnabled()) {
      if (topoOrder[i]->GetEnabled()) {
        hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(),
                                              topoOrder[i]->GetKernelName(),
@@ -186,6 +186,9 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
  unsigned int isEnabled_;
  uint8_t gpuPacket_[64];  //!< GPU Packet to enqueue during graph launch
  std::string capturedKernelName_;
+  size_t alignedKernArgSize_ = 256;       //!< Aligned size required for kernel args
+  size_t kernargSegmentByteSize_ = 256;   //!< Kernel arg segment byte size
+  size_t kernargSegmentAlignment_ = 256;  //!< Kernel arg segment alignment

 public:
  GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "",
@@ -237,7 +240,19 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
  uint8_t* GetAqlPacket() { return gpuPacket_; }
  void SetKernelName(const std::string& kernelName) { capturedKernelName_ = kernelName; }
  const std::string& GetKernelName() const { return capturedKernelName_; }
-
+  size_t GetKerArgSize() const { return alignedKernArgSize_; }
+  size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
+  size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
+  void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) {
+    hipError_t status = CreateCommand(capture_stream);  // NOTE(review): status is never checked
+    for (auto& command : commands_) {
+      command->setCapturingState(true, GetAqlPacket(), kernArgOffset, &capturedKernelName_);
+      // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
+      // The packet is stored in gpuPacket_ and submitted during graph launch.
+      command->submit(*(command->queue())->vdev());
+      command->release();
+    }
+  }
  hip::Stream* GetQueue() const { return stream_; }

  virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) {
@@ -380,6 +395,20 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
  }
  unsigned int GetEnabled() const { return isEnabled_; }
  void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; }
+  // Returns true if DEBUG_CLR_GRAPH_PACKET_CAPTURE is set and this node's type is a kernel node.
+  bool GraphCaptureEnabled() {
+    bool isGraphCapture = false;
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+      switch (GetType()) {
+        case hipGraphNodeTypeKernel:
+          isGraphCapture = true;
+          break;
+        default:
+          break;
+      }
+    }
+    return isGraphCapture;
+  }
 };

 struct Graph {
@@ -835,16 +864,9 @@ class GraphKernelNode : public GraphNode {
  hipKernelNodeAttrValue kernelAttr_;  //!< Kernel node attributes
  unsigned int kernelAttrInUse_;       //!< Kernel attributes in use
  ihipExtKernelEvents kernelEvents_;   //!< Events for Ext launch kernel
-  size_t alignedKernArgSize_;          //!< Aligned size required for kernel args
-  size_t kernargSegmentByteSize_;      //!< Kernel arg segment byte size
-  size_t kernargSegmentAlignment_;     //!< Kernel arg segment alignment
  bool hasHiddenHeap_;                 //!< Kernel has hidden heap(device side allocation)

-
 public:
-  size_t GetKerArgSize() const { return alignedKernArgSize_; }
-  size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
-  size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
  bool HasHiddenHeap() const { return hasHiddenHeap_; }
  void EnqueueCommands(hipStream_t stream) override {
    // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
@@ -888,21 +910,6 @@ class GraphKernelNode : public GraphNode {
    out << "];";
    }

-  void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) {
-    hipError_t status = CreateCommand(capture_stream);
-    for (auto& command : commands_) {
-      reinterpret_cast<amd::NDRangeKernelCommand*>(command)->setCapturingState(
-          true, GetAqlPacket(), kernArgOffset);
-
-      // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
-      // The packet is stored in gpuPacket_ and submitted during graph launch.
-      command->submit(*(command->queue())->vdev());
-      // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes
-      SetKernelName(reinterpret_cast<amd::NDRangeKernelCommand*>(command)->kernel().name());
-      command->release();
-    }
-  }
-
  virtual std::string GetLabel(hipGraphDebugDotFlags flag) override {
    hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
    hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
@@ -1497,6 +1497,9 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
      }
    }
  }
+  if (command.getCapturingState()) {
+    currCmd_ = &command;
+  }
 }

 // ================================================================================================
@@ -1514,6 +1517,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
  if (AMD_DIRECT_DISPATCH) {
    assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty());
  }
+  currCmd_ = nullptr;
 }

 // ================================================================================================
@@ -3018,7 +3022,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,

  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
-  bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
+  bool isGraphCapture = currCmd_ != nullptr && currCmd_->getCapturingState();
  for (int j = 0; j < iteration; j++) {
    // Reset global size for dimension dim if split is needed
    if (dim != -1) {
@@ -3238,7 +3242,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
      // Allocate buffer to hold kernel arguments
      if (isGraphCapture) {
-        argBuffer = vcmd->getKernArgOffset();
+        argBuffer = currCmd_->getKernArgOffset();
+        currCmd_->SetKernelName(gpuKernel.name());
      } else {

        argBuffer = reinterpret_cast<address>(
@@ -3324,18 +3329,18 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
      aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    }

-    if (vcmd == nullptr) {
+    if (isGraphCapture) {
      // Dispatch the packet
      if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
-                             (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION)) {
+                        (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
+                        GPU_FLUSH_ON_EXECUTION, currCmd_->getCapturingState(),
+                        currCmd_->getAqlPacket())) {
        return false;
      }
    } else {
      if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                             (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION, vcmd->getCapturingState(),
-                             vcmd->getAqlPacket())) {
+                             GPU_FLUSH_ON_EXECUTION)) {
        return false;
      }
    }
@@ -575,5 +575,7 @@ class VirtualGPU : public device::VirtualDevice {
  std::atomic<uint> lastUsedSdmaEngineMask_;     //!< Last Used SDMA Engine mask

  using KernelArgImpl = device::Settings::KernelArgImpl;
+
+  amd::Command* currCmd_ = nullptr;  //!< Current command under capture
 };
 }
@@ -256,6 +256,10 @@ class Command : public Event {
  std::vector<void*> data_;
  const Event* waitingEvent_;  //!< Waiting event associated with the marker

+  bool capturing_ = false;           //!< Flag to enable/disable graph gpu packet capture
+  uint8_t* gpuPacket_ = nullptr;     //!< GPU packet to capture, when graph capturing is enabled
+  address kernArgOffset_ = nullptr;  //!< KernelArg buffer to use when graph capturing is enabled
+  std::string* capturedKernelName_ = nullptr;  //!< Where to store the name of the captured kernel
 protected:
  bool cpu_wait_ = false;         //!< If true, then the command was issued for CPU/GPU sync

@@ -292,6 +296,31 @@ class Command : public Event {
  }

 public:
+  //! Returns true if graph GPU packet capture is enabled for this command
+  bool getCapturingState() const { return capturing_; }
+
+  //! Sets AQL capture state, aql packet to capture and where to copy kernArgs
+  void setCapturingState(bool state, uint8_t* packet, address kernArgOffset,
+                         std::string* capturedKernelName) {
+    capturing_ = state;
+    gpuPacket_ = packet;
+    kernArgOffset_ = kernArgOffset;
+    capturedKernelName_ = capturedKernelName;
+  }
+
+  //! Updates kernel name with the captured kernel name
+  void SetKernelName(const std::string& kernelName) {
+    if (capturedKernelName_ != nullptr) {
+      *capturedKernelName_ = kernelName;
+    }
+  }
+
+  //! Returns the GPU packet buffer set via setCapturingState (nullptr if never set)
+  const uint8_t* getAqlPacket() const { return gpuPacket_; }
+
+  //! Returns the kernel argument buffer address set via setCapturingState (nullptr if never set)
+  const address getKernArgOffset() const { return kernArgOffset_; }
+
  //! Overload new/delete for fast commands allocation/destruction
  void* operator new(size_t size);
  void operator delete(void* ptr);
@@ -1075,10 +1104,6 @@ class NDRangeKernelCommand : public Command {
  uint32_t firstDevice_;    //!< Device index of the first device in the gridc
  uint32_t numWorkgroups_;  //!< Total number of workgroups in the current launch

-  bool capturing_ = false;           //!< Flag to enable/disable graph gpu packet capture
-  uint8_t* gpuPacket_ = nullptr;     //!< GPU packet to capture, when graph capturing is enabled
-  address kernArgOffset_ = nullptr;  //!< KernelArg buffer to used when graph capturing is enabled
-
 public:
  enum {
    CooperativeGroups = 0x01,
@@ -1086,22 +1111,6 @@ class NDRangeKernelCommand : public Command {
    AnyOrderLaunch = 0x04,
  };

-  //! Returns AQL buffer state
-  bool getCapturingState() const { return capturing_; }
-
-  //! Sets AQL capture state, aql packet to capture and where to copy kernArgs
-  void setCapturingState(bool state, uint8_t* packet, address kernArgOffset) {
-    capturing_ = state;
-    gpuPacket_ = packet;
-    kernArgOffset_ = kernArgOffset;
-  }
-
-  //! returns the graph executable object command belongs to.
-  const uint8_t* getAqlPacket() const { return gpuPacket_; }
-
-  //! returns the graph executable object command belongs to.
-  const address getKernArgOffset() const { return kernArgOffset_; }
-
  //! Construct an ExecuteKernel command
  NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
                       const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,