diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp
index bf1ee627f9..3d44b10b67 100644
--- a/hipamd/src/hip_graph_internal.cpp
+++ b/hipamd/src/hip_graph_internal.cpp
@@ -362,9 +362,9 @@ void GetKernelArgSizeForGraph(std::vector<std::vector<Node>>& parallelLists,
   // GPU packet capture is enabled for kernel nodes. Calculate the kernel
   // arg size required for all graph kernel nodes to allocate
   for (const auto& list : parallelLists) {
-    for (auto& node : list) {
-      if (node->GetType() == hipGraphNodeTypeKernel) {
-        kernArgSizeForGraph += reinterpret_cast<hip::GraphKernelNode*>(node)->GetKerArgSize();
+    for (hip::GraphNode* node : list) {
+      if (node->GraphCaptureEnabled()) {
+        kernArgSizeForGraph += node->GetKerArgSize();
       } else if (node->GetType() == hipGraphNodeTypeGraph) {
         auto& childParallelLists =
             reinterpret_cast<hip::ChildGraphNode*>(node)->GetParallelLists();
@@ -388,18 +388,19 @@ hipError_t AllocKernelArgForGraphNode(std::vector<hip::Node>& topoOrder,
         graphExec->SetHiddenHeap();
         initialized = true;
       }
-      auto kernelNode = reinterpret_cast<hip::GraphKernelNode*>(node);
-      // From the kernel pool allocate the kern arg size required for the current kernel node.
+    }
+    if (node->GraphCaptureEnabled()) {
+      // From the kernel pool allocate the kern arg size required for the current node.
       address kernArgOffset = nullptr;
-      if (kernelNode->GetKernargSegmentByteSize()) {
+      if (node->GetKernargSegmentByteSize()) {
         kernArgOffset = graphExec->kernArgManager_->AllocKernArg(
-            kernelNode->GetKernargSegmentByteSize(), kernelNode->GetKernargSegmentAlignment());
+            node->GetKernargSegmentByteSize(), node->GetKernargSegmentAlignment());
         if (kernArgOffset == nullptr) {
           return hipErrorMemoryAllocation;
         }
       }
       // Form GPU packet capture for the kernel node.
-      kernelNode->CaptureAndFormPacket(capture_stream, kernArgOffset);
+      node->CaptureAndFormPacket(capture_stream, kernArgOffset);
     } else if (node->GetType() == hipGraphNodeTypeGraph) {
       auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
       auto& childParallelLists = childNode->GetParallelLists();
@@ -551,7 +552,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::St
     accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
   }
   for (int i = 0; i < topoOrder.size(); i++) {
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) {
+    if (topoOrder[i]->GraphCaptureEnabled()) {
       if (topoOrder[i]->GetEnabled()) {
         hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(),
                                               topoOrder[i]->GetKernelName(),
diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp
index 20499a202b..171bfb24da 100644
--- a/hipamd/src/hip_graph_internal.hpp
+++ b/hipamd/src/hip_graph_internal.hpp
@@ -186,6 +186,9 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
   unsigned int isEnabled_;
   uint8_t gpuPacket_[64];  //!< GPU Packet to enqueue during graph launch
   std::string capturedKernelName_;
+  size_t alignedKernArgSize_ = 256;       //!< Aligned size required for kernel args
+  size_t kernargSegmentByteSize_ = 256;   //!< Kernel arg segment byte size
+  size_t kernargSegmentAlignment_ = 256;  //!< Kernel arg segment alignment
 
  public:
   GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "",
@@ -237,7 +240,19 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
   uint8_t* GetAqlPacket() { return gpuPacket_; }
   void SetKernelName(const std::string& kernelName) { capturedKernelName_ = kernelName; }
   const std::string& GetKernelName() const { return capturedKernelName_; }
-
+  size_t GetKerArgSize() const { return alignedKernArgSize_; }
+  size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
+  size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
+  void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) {
+    hipError_t status = CreateCommand(capture_stream);
+    for (auto& command : commands_) {
+      command->setCapturingState(true, GetAqlPacket(), kernArgOffset, &capturedKernelName_);
+      // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
+      // The packet is stored in gpuPacket_ and submitted during graph launch.
+      command->submit(*(command->queue())->vdev());
+      command->release();
+    }
+  }
   hip::Stream* GetQueue() const { return stream_; }
 
   virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) {
@@ -380,6 +395,20 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
   }
   unsigned int GetEnabled() const { return isEnabled_; }
   void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; }
+  // Returns true if capture is enabled for the current node.
+  bool GraphCaptureEnabled() {
+    bool isGraphCapture = false;
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+      switch (GetType()) {
+        case hipGraphNodeTypeKernel:
+          isGraphCapture = true;
+          break;
+        default:
+          break;
+      }
+    }
+    return isGraphCapture;
+  }
 };
 
 struct Graph {
@@ -835,16 +864,9 @@ class GraphKernelNode : public GraphNode {
   hipKernelNodeAttrValue kernelAttr_;  //!< Kernel node attributes
   unsigned int kernelAttrInUse_;       //!< Kernel attributes in use
   ihipExtKernelEvents kernelEvents_;   //!< Events for Ext launch kernel
-  size_t alignedKernArgSize_;          //!< Aligned size required for kernel args
-  size_t kernargSegmentByteSize_;      //!< Kernel arg segment byte size
-  size_t kernargSegmentAlignment_;     //!< Kernel arg segment alignment
   bool hasHiddenHeap_;                 //!< Kernel has hidden heap(device side allocation)
 
-
  public:
-  size_t GetKerArgSize() const { return alignedKernArgSize_; }
-  size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
-  size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
   bool HasHiddenHeap() const { return hasHiddenHeap_; }
   void EnqueueCommands(hipStream_t stream) override {
     // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
@@ -888,21 +910,6 @@ class GraphKernelNode : public GraphNode {
     out << "];";
     }
 
-  void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) {
-    hipError_t status = CreateCommand(capture_stream);
-    for (auto& command : commands_) {
-      reinterpret_cast<amd::NDRangeKernelCommand*>(command)->setCapturingState(
-          true, GetAqlPacket(), kernArgOffset);
-
-      // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
-      // The packet is stored in gpuPacket_ and submitted during graph launch.
-      command->submit(*(command->queue())->vdev());
-      // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes
-      SetKernelName(reinterpret_cast<amd::NDRangeKernelCommand*>(command)->kernel().name());
-      command->release();
-    }
-  }
-
   virtual std::string GetLabel(hipGraphDebugDotFlags flag) override {
     hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
     hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index fa822ac3c1..3b10056c8a 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -1497,6 +1497,9 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
       }
     }
   }
+  if (command.getCapturingState()) {
+    currCmd_ = &command;
+  }
 }
 
 // ================================================================================================
@@ -1514,6 +1517,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
   if (AMD_DIRECT_DISPATCH) {
     assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty());
   }
+  currCmd_ = nullptr;
 }
 
 // ================================================================================================
@@ -3018,7 +3022,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
 
   amd::Memory* const* memories =
       reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
-  bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
+  bool isGraphCapture = currCmd_ != nullptr && currCmd_->getCapturingState();
   for (int j = 0; j < iteration; j++) {
     // Reset global size for dimension dim if split is needed
     if (dim != -1) {
@@ -3238,7 +3242,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
       // Allocate buffer to hold kernel arguments
       if (isGraphCapture) {
-        argBuffer = vcmd->getKernArgOffset();
+        argBuffer = currCmd_->getKernArgOffset();
+        currCmd_->SetKernelName(gpuKernel.name());
       } else {
 
         argBuffer = reinterpret_cast<address>(
@@ -3324,18 +3329,18 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
     }
 
-    if (vcmd == nullptr) {
+    if (isGraphCapture) {
       // Dispatch the packet
       if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
-                             (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION)) {
+                        (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
+                        GPU_FLUSH_ON_EXECUTION, currCmd_->getCapturingState(),
+                        currCmd_->getAqlPacket())) {
         return false;
       }
     } else {
       if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                              (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION, vcmd->getCapturingState(),
-                             vcmd->getAqlPacket())) {
+                             GPU_FLUSH_ON_EXECUTION)) {
         return false;
       }
     }
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 053c9751b7..8ae66fde31 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -575,5 +575,7 @@ class VirtualGPU : public device::VirtualDevice {
   std::atomic<uint> lastUsedSdmaEngineMask_;     //!< Last Used SDMA Engine mask
 
   using KernelArgImpl = device::Settings::KernelArgImpl;
+
+  amd::Command* currCmd_ = nullptr;  //!< Current command under capture
 };
 }
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index fa8c315b57..af70bb28ab 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -256,6 +256,10 @@ class Command : public Event {
   std::vector<void*> data_;
   const Event* waitingEvent_;  //!< Waiting event associated with the marker
 
+  bool capturing_ = false;           //!< Flag to enable/disable graph gpu packet capture
+  uint8_t* gpuPacket_ = nullptr;     //!< GPU packet to capture, when graph capturing is enabled
+  address kernArgOffset_ = nullptr;  //!< KernelArg buffer to use when graph capturing is enabled
+  std::string* capturedKernelName_ = nullptr;  //!< Receives the captured kernel name
  protected:
   bool cpu_wait_ = false;         //!< If true, then the command was issued for CPU/GPU sync
 
@@ -292,6 +296,31 @@ class Command : public Event {
   }
 
  public:
+  //! Returns true when GPU packet capture is enabled for this command
+  bool getCapturingState() const { return capturing_; }
+
+  //! Sets capture state, AQL packet buffer, kernArg buffer and kernel name destination
+  void setCapturingState(bool state, uint8_t* packet, address kernArgOffset,
+                         std::string* capturedKernelName) {
+    capturing_ = state;
+    gpuPacket_ = packet;
+    kernArgOffset_ = kernArgOffset;
+    capturedKernelName_ = capturedKernelName;
+  }
+
+  //! Updates kernel name with the captured kernel name
+  void SetKernelName(const std::string& kernelName) {
+    if (capturedKernelName_ != nullptr) {
+      *capturedKernelName_ = kernelName;
+    }
+  }
+
+  //! Returns the buffer holding the GPU packet to capture.
+  const uint8_t* getAqlPacket() const { return gpuPacket_; }
+
+  //! Returns the kernel argument buffer used during graph capture.
+  const address getKernArgOffset() const { return kernArgOffset_; }
+
   //! Overload new/delete for fast commands allocation/destruction
   void* operator new(size_t size);
   void operator delete(void* ptr);
@@ -1075,10 +1104,6 @@ class NDRangeKernelCommand : public Command {
   uint32_t firstDevice_;    //!< Device index of the first device in the gridc
   uint32_t numWorkgroups_;  //!< Total number of workgroups in the current launch
 
-  bool capturing_ = false;           //!< Flag to enable/disable graph gpu packet capture
-  uint8_t* gpuPacket_ = nullptr;     //!< GPU packet to capture, when graph capturing is enabled
-  address kernArgOffset_ = nullptr;  //!< KernelArg buffer to used when graph capturing is enabled
-
  public:
   enum {
     CooperativeGroups = 0x01,
@@ -1086,22 +1111,6 @@ class NDRangeKernelCommand : public Command {
     AnyOrderLaunch = 0x04,
   };
 
-  //! Returns AQL buffer state
-  bool getCapturingState() const { return capturing_; }
-
-  //! Sets AQL capture state, aql packet to capture and where to copy kernArgs
-  void setCapturingState(bool state, uint8_t* packet, address kernArgOffset) {
-    capturing_ = state;
-    gpuPacket_ = packet;
-    kernArgOffset_ = kernArgOffset;
-  }
-
-  //! returns the graph executable object command belongs to.
-  const uint8_t* getAqlPacket() const { return gpuPacket_; }
-
-  //! returns the graph executable object command belongs to.
-  const address getKernArgOffset() const { return kernArgOffset_; }
-
   //! Construct an ExecuteKernel command
   NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
                        const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,