clr: Use graph segment scheduling to process HIP Graphs (#1372)

* clr: Use graph segment scheduling to process HIP Graphs * Add a broader path to use packet capture for all topologies * Refactor code * Use DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING to toggle new vs classic path, enabled by default * clr: Few fixes and improvements * clr: Detect complex graphs to take classic path * Use DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING=2 to force segment scheduling path * clr: Fix a corner-case stack corruption * clr: Track commands of segments instead of snapshots * clr: Fix batch dispatch logic * Track fence_dirty_ flag for commands of other streams * Dependency resolution markers can now accommodate a dirty fence on cross streams --------- Co-authored-by: Ioannis Assiouras <Ioannis.Assiouras@amd.com> Co-authored-by: Godavarthy Surya, Anusha <agodavar@amd.com>
2025-12-01 12:49:26 -08:00
@@ -1425,9 +1425,9 @@ hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraph
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
-  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-    status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
-                 ->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
+  auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
+  if (graphExec->IsSegmentSchedulingEnabled()) {
+    status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
  }
  HIP_RETURN(status);
 }
@@ -1523,12 +1523,14 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
    return hipErrorOutOfMemory;
  }
  graph->clone(*pGraphExec, true);
-  (*pGraphExec)->ScheduleNodes();
-  if (false == (*pGraphExec)->TopologicalOrder()) {
+
+  hipError_t scheduleStatus = (*pGraphExec)->ScheduleNodes();
+  if (scheduleStatus != hipSuccess) {
    delete *pGraphExec;
-    return hipErrorInvalidValue;
+    *pGraphExec = nullptr;
+    return scheduleStatus;
  }
-  graph->SetGraphInstantiated(true);
+
  if (DEBUG_HIP_GRAPH_DOT_PRINT) {
    static int i = 1;
    std::string filename =
@@ -1538,7 +1540,10 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
      LogPrintfInfo("[hipGraph] graph dump:%s", filename.c_str());
    }
  }
-  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+
+  graph->SetGraphInstantiated(true);
+
+  if ((*pGraphExec)->IsSegmentSchedulingEnabled()) {
    (*pGraphExec)->SetKernelArgManager(new hip::GraphKernelArgManager());
  }
  return (*pGraphExec)->Init();
@@ -1555,7 +1560,7 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
  if (status == hipSuccess) {
    *pGraphExec = reinterpret_cast<hipGraphExec_t>(ge);
  }
-  HIP_RETURN(status);
+  HIP_RETURN(status, ReturnPtrValue(pGraphExec));
 }

 hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
@@ -1574,7 +1579,7 @@ hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t g
  hip::GraphExec* ge;
  hipError_t status = ihipGraphInstantiate(&ge, reinterpret_cast<hip::Graph*>(graph), flags);
  *pGraphExec = reinterpret_cast<hipGraphExec_t>(ge);
-  HIP_RETURN(status);
+  HIP_RETURN(status, ReturnPtrValue(pGraphExec));
 }

 hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph,
@@ -1609,7 +1614,7 @@ hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t
    HIP_RETURN(status);
  }

-  HIP_RETURN(hipSuccess);
+  HIP_RETURN(hipSuccess, ReturnPtrValue(pGraphExec));
 }

 hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec) {
@@ -1820,9 +1825,9 @@ hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
-  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-    status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
-                 ->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
+  auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
+  if (graphExec->IsSegmentSchedulingEnabled()) {
+    status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
  }
  HIP_RETURN(status);
 }
@@ -1871,9 +1876,9 @@ hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
-  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-    status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
-                 ->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
+  auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
+  if (graphExec->IsSegmentSchedulingEnabled()) {
+    status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
  }
  HIP_RETURN(status);
 }
@@ -1931,9 +1936,9 @@ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
-  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-    status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
-                 ->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
+  auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
+  if (graphExec->IsSegmentSchedulingEnabled()) {
+    status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
  }
  HIP_RETURN(status);
 }
@@ -2008,13 +2013,18 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra
  if (status != hipSuccess) {
    return status;
  }
-  if (reinterpret_cast<hip::ChildGraphNode*>(clonedNode)->GetGraphCaptureStatus()) {
+
+  hip::ChildGraphNode* childNode = reinterpret_cast<hip::ChildGraphNode*>(clonedNode);
+
+  // After SetParams updates node parameters in-place, we need to update the cached AQL packets
+  auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
+  if (graphExec->IsSegmentSchedulingEnabled() || childNode->GetGraphCaptureStatus()) {
    std::vector<hip::GraphNode*> childGraphNodes;
-    reinterpret_cast<hip::ChildGraphNode*>(clonedNode)->TopologicalOrder(childGraphNodes);
+    childNode->TopologicalOrder(childGraphNodes);
    for (std::vector<hip::GraphNode*>::size_type i = 0; i != childGraphNodes.size(); i++) {
      if (childGraphNodes[i]->GraphCaptureEnabled()) {
-        status = reinterpret_cast<hip::ChildGraphNode*>(clonedNode)
-                     ->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
+        status =
+            childNode->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
        if (status != hipSuccess) {
          return status;
        }
@@ -2414,9 +2424,9 @@ hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec,
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
-  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-    status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
-                 ->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
+  auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
+  if (graphExec->IsSegmentSchedulingEnabled()) {
+    status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
  }
  HIP_RETURN(status);
 }
@@ -2497,9 +2507,9 @@ hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hi
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
-  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-    status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
-                 ->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
+  auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
+  if (graphExec->IsSegmentSchedulingEnabled()) {
+    status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
  }
  HIP_RETURN(status);
 }
@@ -2734,10 +2744,11 @@ hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph,
          *updateResult_out = hipGraphExecUpdateErrorNotSupported;
        }
        HIP_RETURN(hipErrorGraphExecUpdateFailure);
-      } else if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && newGraphNodes[i]->GraphCaptureEnabled()) {
-        status =
-            reinterpret_cast<hip::GraphExec*>(hGraphExec)
-                ->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(oldGraphExecNodes[i]));
+      } else {
+        auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
+        if (graphExec->IsSegmentSchedulingEnabled() && newGraphNodes[i]->GraphCaptureEnabled()) {
+          status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(oldGraphExecNodes[i]));
+        }
      }
    } else {
      *hErrorNode_out = reinterpret_cast<hipGraphNode_t>(newGraphNodes[i]);
@@ -3091,12 +3102,16 @@ hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNod
    HIP_RETURN(hipErrorInvalidValue);
  }
  clonedNode->SetEnabled(isEnabled);
-  // Update packet batches when node is enabled/disabled
-  hipError_t status = graphExec->UpdatePacketBatchesForNodeEnableDisable(clonedNode, isEnabled != 0);
-  if (status != hipSuccess) {
-    HIP_RETURN(status);
+
+  hipError_t status = hipSuccess;
+  if (graphExec->IsSegmentSchedulingEnabled()) {
+    // Update packet batches when node is enabled/disabled
+    status = graphExec->UpdatePacketBatchesForNodeEnableDisable(clonedNode, isEnabled != 0);
+    if (status != hipSuccess) {
+      HIP_RETURN(status);
+    }
  }
-  HIP_RETURN(hipSuccess);
+  HIP_RETURN(status);
 }

 hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
@@ -3449,8 +3464,9 @@ hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGrap
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
-  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-    status = reinterpret_cast<hip::GraphExec*>(hGraphExec)->UpdateAQLPacket(clonedNode);
+  auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
+  if (graphExec->IsSegmentSchedulingEnabled()) {
+    status = graphExec->UpdateAQLPacket(clonedNode);
  }
  HIP_RETURN(status);
 }
@@ -3572,8 +3588,9 @@ hipError_t hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t no
    return status;
  }

-  if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-    status = reinterpret_cast<hip::GraphExec*>(graphExec)->UpdateAQLPacket(clonedNode);
+  auto graphExecPtr = reinterpret_cast<hip::GraphExec*>(graphExec);
+  if (graphExecPtr->IsSegmentSchedulingEnabled()) {
+    status = graphExecPtr->UpdateAQLPacket(clonedNode);
  }
  return status;
 }
@@ -320,6 +320,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
  const std::vector<Node>& GetDependencies() const { return dependencies_; }
  /// Update graph node dependecies
  void SetDependencies(std::vector<Node>& dependencies) {
+    dependencies_.clear();
    for (auto entry : dependencies) {
      dependencies_.push_back(entry);
    }
@@ -366,6 +367,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
  const std::vector<Node>& GetEdges() const { return edges_; }
  /// Updates graph node children
  void SetEdges(std::vector<Node>& edges) {
+    edges_.clear();
    for (auto entry : edges) {
      edges_.push_back(entry);
    }
@@ -425,19 +427,10 @@ class GraphNode : public hipGraphNodeDOTAttribute {
  }
  unsigned int GetEnabled() const { return isEnabled_; }
  void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; }
-  // Returns true if capture is enabled for the current node.
+
+  // Base implementation returns false; specific node types should override.
  virtual bool GraphCaptureEnabled() {
-    bool isGraphCapture = false;
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-      switch (GetType()) {
-        case hipGraphNodeTypeMemset:
-          isGraphCapture = true;
-          break;
-        default:
-          break;
-      }
-    }
-    return isGraphCapture;
+    return false;
  }
  virtual void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) override {
    out << "[";
@@ -454,6 +447,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
    out << GetLabel(flag);
    if (DEBUG_HIP_GRAPH_DOT_PRINT) {
      out << "\nStreamId:" << stream_id_;
+      out << "\nSegmentId:" << segment_id_;
      out << "\nSignalIsRequired: " << ((signal_is_required_) ? "true" : "false");
      out << "\nDeviceId:" << dev_id_;
    }
@@ -479,6 +473,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
  size_t inDegree_;         //!< count of in coming edges (@todo: remove, it's dependencies_.size())
  size_t outDegree_;        //!< count of outgoing edges (@todo: remove, it's edges_.size())
  int32_t stream_id_ = -1;  //! Stream ID on which this node will be executed
+  int32_t segment_id_ = -1;  //! Segment ID on which this node will be executed
  int32_t launch_id_ = -1;  //! Launch ID of this node in the entire graph execution sequence
  static int nextID;
  Graph* parentGraph_;
@@ -556,6 +551,8 @@ class Graph {
    graphSet_.insert(this);
    mem_pool_ = device->GetGraphMemoryPool();
    graphInstantiated_ = false;
+    // Initialize per-graph segment scheduling flag from global env var
+    use_segment_scheduling_ = DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING;
    roots_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
    leafs_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
    wait_order_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
@@ -568,7 +565,7 @@ class Graph {
      }
    }
  }
-  ~Graph() {
+  virtual ~Graph() {
    for (auto node : vertices_) {
      delete node;
    }
@@ -639,6 +636,8 @@ class Graph {
  const std::vector<Node>& GetTopoOrder() const { return topoOrder_; }
  /// returns all the edges in the graph
  std::vector<std::pair<Node, Node>> GetEdges() const;
+  /// Returns whether segment scheduling is enabled for this graph
+  bool IsSegmentSchedulingEnabled() const { return use_segment_scheduling_; }
  // returns the original graph ptr if cloned
  const Graph* getOriginalGraph() const { return pOriginalGraph_; }
  // Add user obj resource to graph
@@ -679,7 +678,43 @@ class Graph {
  );

  //! Schedules all nodes in the graph into different streams
-  void ScheduleNodes();
+  hipError_t ScheduleNodes();
+
+  // Hierarchical path structure for child graph support
+  struct HierarchicalPath {
+    std::vector<Node> nodes;               //!< Nodes in this path (at this level only)
+    Node child_graph_node = nullptr;       //!< Reference to child graph node if present in path
+    int child_graph_paths_index = -1;      //!< Index into child_graph_paths (-1 if no child)
+    int device_id = -1;                    //!< Device ID for this path
+  };
+
+  // Structure to store execution paths for a graph and its children hierarchically
+  struct GraphExecutionPaths {
+    Graph* graph_ptr = nullptr;           //!< Pointer to the graph this belongs to
+    std::vector<HierarchicalPath> paths;  //!< All execution paths at this level only
+    std::vector<GraphExecutionPaths> child_graph_paths;  //!< Child graph execution paths
+  };
+
+  //! Schedules nodes into batches for optimized execution
+  hipError_t ScheduleNodesIntoBatches();
+
+  //! Find execution paths hierarchically, keeping child graphs separate
+  GraphExecutionPaths FindExecutionPathsHierarchical();
+
+  //! Recursively find all paths from a node with hierarchical child graph handling
+  void FindPathsRecursiveHierarchical(Node node,
+                                      std::vector<Node>& current_path,
+                                      std::unordered_set<unsigned int>& visited,
+                                      GraphExecutionPaths& graph_paths);
+
+  //! Create segments from hierarchical execution paths
+  void CreateSegmentsFromPaths(const GraphExecutionPaths& exec_paths);
+
+  //! Resolve dependencies between segments
+  void ResolveSegmentDependencies();
+
+  //! Calculate dependency levels for segments using topological sort
+  void CalculateSegmentTopoDependencyLevels();

  //! Runs one node on the assigned stream
  bool RunOneNode(Node node);  //!< Node for the execution on GPU
@@ -785,8 +820,35 @@ class Graph {
  //!< during multi-device graph execution scheduling.
  std::unordered_map<int, std::set<int>> streams_dev_ids_;
  int instantiateDeviceId_ = -1;
-  //! Topological order of the graph doesn't include nodes embedded as part of the child graph
+    //! Topological order of the graph doesn't include nodes embedded as part of the child graph
  std::vector<Node> topoOrder_;
+
+  // Segment dependency structures
+  struct Segment {
+    int id = -1;
+    int stream_id = -1;                         // Assigned stream for this segment
+    int dependency_level = -1;                  // Topological level (0 = root, 1 = depends on root, etc.)
+    std::vector<Node> nodes;
+    std::vector<int> segment_ids_dependencies;  // Segments this segment depends on (within same graph)
+    std::vector<int> segment_ids_edges;         // Segments that depend on this segment (within same graph)
+    Node first_node = nullptr;
+    Node last_node = nullptr;
+
+    // Hierarchical child graph information
+    Graph* child_graph_ptr = nullptr;           // Direct pointer to child graph for quick access
+  };
+
+  //! Segment information for batch scheduling
+  std::vector<Segment> segments_;
+  //! Map of node to segment ID
+  std::unordered_map<Node, int> node_to_segment_id_;
+  //! Maximum dependency level in the segment graph
+  int max_dependency_level_ = -1;
+  //! Map of dependency level to list of segment IDs at that level
+  std::unordered_map<int, std::vector<int>> segments_per_level_;
+
+  std::unordered_map<Node, Node> clonedNodes_;
+
 private:
  friend class GraphExec;
  std::vector<Node> vertices_;
@@ -807,7 +869,10 @@ class Graph {
  hip::MemoryPool* mem_pool_;          //!< Memory pool, associated with this graph
  std::unordered_set<GraphNode*> capturedNodes_;
  bool graphInstantiated_;
-  std::unordered_map<Node, Node> clonedNodes_;
+  //! Per-graph flag to control segment scheduling.
+  //! Can be disabled per-graph for complex graphs that benefit from classic path
+  bool use_segment_scheduling_;
+
  //! Map of device ID to vector of streams allocated for that device during graph execution.
  //! Each device may require multiple streams to handle parallel execution of graph nodes.
  std::unordered_map<int, std::vector<hip::Stream*>> streams_dev_;
@@ -815,6 +880,17 @@ class Graph {
  //! Map tracking the maximum number of concurrent streams required per device for graph execution.
  //! Key: device ID, Value: maximum number of streams needed for that device
  std::unordered_map<int, int> max_streams_dev_;
+
+  // Batch-based scheduling structures
+  struct Batch {
+    int id = -1;
+    int stream_id = 0;
+    std::vector<Node> nodes;
+    std::vector<int> incoming_stream_ids;
+    Node last_node = nullptr;
+  };
+
+  std::vector<Batch> batches_;
 };

 class GraphExec : public amd::ReferenceCountedObject, public Graph {
@@ -822,6 +898,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
  static std::unordered_set<GraphExec*> graphExecSet_;
  static amd::Monitor graphExecSetLock_;
  static amd::Monitor graphExecStreamCreateLock_;
+  bool graph_dumped_ = false;
  GraphExec(uint64_t flags = 0)
      : ReferenceCountedObject(), Graph(hip::getCurrentDevice()), flags_(flags) {
    amd::ScopedLock lock(graphExecSetLock_);
@@ -832,20 +909,20 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
    for (auto streams : parallel_streams_) {
      for (auto stream : streams.second) {
        if (stream != nullptr) {
+          stream->finish();
          constexpr bool kForceDestroy = true;
          hip::Stream::Destroy(stream, kForceDestroy);
        }
      }
    }
    parallel_streams_.clear();
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+    if (IsSegmentSchedulingEnabled()) {
      if (kernArgManager_ != nullptr) {
        kernArgManager_->release();
      }
    }

-    packetBatches_.clear();
-    nodeCaptureStatus_.clear();
+    segmentBatches_.clear();
  }

  Node GetClonedNode(Node node) {
@@ -885,9 +962,13 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
  static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
  hipError_t CaptureAndFormPacketsForGraph();
  void GetKernelArgSizeForGraph(std::unordered_map<int, size_t>& kernArgSizeForGraph);
-  hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream);
-  //! Enqueue a multi-device linear graph for execution
-  hipError_t EnqueueMultiDeviceLinearGraph(hip::Stream* hip_stream);
+
+  amd::Command* EnqueueSegmentedGraph(hip::Stream* launch_stream,
+                                      const std::vector<hip::Stream*>& streams,
+                                      hipError_t* out_status = nullptr);
+  hipError_t EnqueueSegment(const Segment& segment, hip::Stream* stream,
+                            amd::AccumulateCommand* accumulate);
+
  bool TopologicalOrder() { return Graph::TopologicalOrder(topoOrder_); }
  //! Update streams for the graph execution with launch stream from application
  void UpdateStreams(hip::Stream* launch_stream);
@@ -895,20 +976,41 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
  //! This method analyzes the stream-to-device mappings and recursively processes
  //! child graphs to determine the maximum concurrent streams needed per device
  void FindStreamsReqPerDev();
+  //! Find the number of streams required per device for packet engine mode
+  //! This method analyzes segments to determine per-device stream requirements
+  void FindStreamsReqPerDevForSegments();
+  //! Get the parallel streams map for synchronization before destruction
+  const std::unordered_map<int, std::vector<hip::Stream*>>& GetParallelStreams() const {
+    return parallel_streams_;
+  }

 protected:
+  //! Assign streams to segments at a given dependency level
+  void AssignStreamsToSegments(
+      const std::vector<int>& segments_at_level,
+      hip::Stream* launch_stream,
+      const std::vector<hip::Stream*>& streams,
+      std::unordered_map<int, hip::Stream*>& segment_to_stream);
+
  //! parallel streams per device
  std::unordered_map<int, std::vector<hip::Stream*>> parallel_streams_;
  uint64_t flags_ = 0;
  GraphKernelArgManager* kernArgManager_ = nullptr;  //!< Kernel Arg manager for graph.
  bool hasHiddenHeap_ = false;  //!< Hidden heap indicator for Kernel node
  bool repeatLaunch_ = false;
+  //! Track last launch stream to avoid redundant UpdateStreams
+  hip::Stream* lastLaunchStream_ = nullptr;

  // PacketBatch structure
  struct PacketBatch {
    // Main dispatch vectors - always ready for batch dispatch
    std::vector<uint8_t*> dispatchPackets;
    std::vector<std::string> dispatchKernelNames;
+
+    // Cached filtered lists - built on-demand when nodes are disabled
+    std::vector<uint8_t*> enabledPackets;
+    std::vector<std::string> enabledKernelNames;
+
    // Node tracking
    struct NodeRange {
      size_t startIndex;    // Start index in dispatchPackets
@@ -921,13 +1023,22 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
    PacketBatch() {}
    // O(1) enable/disable operations - just update state
    void setEnabled(GraphNode* node, bool enabled);
+    // Rebuild cached filtered lists if cache is stale
+    void rebuildFilteredLists();
+  };
+
+  //! Structure linking packet batches to segments
+  struct SegmentBatch {
+    int segment_id;           // Segment this batch belongs to
+    std::vector<bool> node_capture_status; // Capture status for each node in this segment
+    std::vector<PacketBatch> packet_batches; // All packet batches for this segment
+
+    SegmentBatch(int seg_id) : segment_id(seg_id) {}
  };

  //! Batches of accumulated packets and kernel names for batch dispatch optimization
-  //! Each batch contains packets from consecutive captured nodes
-  std::vector<PacketBatch> packetBatches_;
-  //! Track which nodes were successfully captured (true) vs need individual execution (false)
-  std::vector<bool> nodeCaptureStatus_;
+  //! Map from segment ID to SegmentBatch for O(1) lookup
+  std::unordered_map<int, SegmentBatch> segmentBatches_;
 };

 class ChildGraphNode : public GraphNode, public GraphExec {
@@ -950,6 +1061,13 @@ class ChildGraphNode : public GraphNode, public GraphExec {

  bool GetGraphCaptureStatus() { return graphCaptureStatus_; }

+  bool GraphCaptureEnabled() override {
+    if (IsSegmentSchedulingEnabled()) {
+      return graphCaptureStatus_;
+    }
+    return false;
+  }
+
  std::vector<Node>& GetChildGraphNodeOrder() { return topoOrder_; }

  void SetStream(hip::Stream* stream) override { stream_ = stream; }
@@ -959,9 +1077,26 @@ class ChildGraphNode : public GraphNode, public GraphExec {
  }

  void EnqueueCommands(hip::Stream* stream) override {
-    if (graphCaptureStatus_) {
-      hipError_t status = EnqueueGraphWithSingleList(stream);
+    // Note: For segmented graphs, EnqueueSegment now calls EnqueueSegmentedGraph recursively
+    // This method is kept as a fallback for non-segmented execution or legacy paths
+
+    if (graphCaptureStatus_ || !segments_.empty()) {
+      // Use hierarchical segment-based enqueue via EnqueueSegmentedGraph
+      // Use this child graph's own parallel_streams_, so pass empty vector
+      hipError_t status = hipSuccess;
+      amd::Command* last_cmd = EnqueueSegmentedGraph(stream, {}, &status);
+
+      if (last_cmd != nullptr) {
+        // This is a fallback path - we don't need to track the command
+        last_cmd->release();
+      }
+
+      if (status != hipSuccess) {
+        ClPrint(amd::LOG_ERROR, amd::LOG_CODE,
+                "[hipGraph] ChildGraphNode::EnqueueCommands failed with status=%d", status);
+      }
    } else if (max_streams_ == 1) {
+      // Legacy topological order execution for non-segmented graphs
      for (int i = 0; i < topoOrder_.size(); i++) {
        topoOrder_[i]->SetStream(stream_);
        hipError_t status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
@@ -1054,6 +1189,7 @@ class GraphKernelNode : public GraphNode {
    out << GetLabel(flag);
    if (DEBUG_HIP_GRAPH_DOT_PRINT) {
      out << "StreamId:" << stream_id_;
+      out << "\nSegmentId:" << segment_id_;
      out << "\nSignalIsRequired: " << ((signal_is_required_) ? "true" : "false");
      out << "\nDeviceId:" << dev_id_;
    }
@@ -1137,7 +1273,7 @@ class GraphKernelNode : public GraphNode {
    }
    hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
    amd::Kernel* kernel = function->kernel();
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+    if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
      auto device = g_devices[dev_id_]->devices()[0];
      device::Kernel* devKernel = const_cast<device::Kernel*>(kernel->getDeviceKernel(*device));
      kernargSegmentByteSize_ = devKernel->KernargSegmentByteSize();
@@ -1270,6 +1406,11 @@ class GraphKernelNode : public GraphNode {
  GraphNode* clone() const override { return new GraphKernelNode(*this); }

  hipError_t CreateCommand(hip::Stream* stream) override {
+    // Clear commands_ first, even if node is disabled
+    hipError_t status = GraphNode::CreateCommand(stream);
+    if (status != hipSuccess) {
+      return status;
+    }
    if (!isEnabled_) {
      return hipSuccess;
    }
@@ -1280,14 +1421,10 @@ class GraphKernelNode : public GraphNode {
    hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
    amd::Kernel* kernel = function->kernel();
    amd::ScopedLock lock(function->dflock_);
-    hipError_t status = validateKernelParams(&kernelParams_, func, dev_id_);
+    status = validateKernelParams(&kernelParams_, func, dev_id_);
    if (hipSuccess != status) {
      return status;
    }
-    status = GraphNode::CreateCommand(stream);
-    if (status != hipSuccess) {
-      return status;
-    }
    commands_.reserve(1);
    amd::Command* command;
    uint32_t flags = 0;
@@ -1471,14 +1608,13 @@ class GraphKernelNode : public GraphNode {
  }

  virtual bool GraphCaptureEnabled() override {
-    bool isGraphCapture = false;
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+    if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
      // Disable capture for cooperative kernels
      if (!coopKernel_) {
-        isGraphCapture = true;
+        return true;
      }
    }
-    return isGraphCapture;
+    return false;
  }
 };

@@ -1500,15 +1636,16 @@ class GraphMemcpyNode : public GraphNode {
  GraphNode* clone() const override { return new GraphMemcpyNode(*this); }

  virtual hipError_t CreateCommand(hip::Stream* stream) override {
+    // Clear commands_ first, even if node is disabled
+    hipError_t status = GraphNode::CreateCommand(stream);
+    if (status != hipSuccess) {
+      return status;
+    }
    if (!isEnabled_ ||
        ((copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault) &&
         IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr))) {
      return hipSuccess;
    }
-    hipError_t status = GraphNode::CreateCommand(stream);
-    if (status != hipSuccess) {
-      return status;
-    }
    commands_.reserve(1);
    amd::Command* command;
    status = ihipMemcpy3DCommand(command, &copyParams_, stream);
@@ -1632,17 +1769,16 @@ class GraphMemcpyNode : public GraphNode {
    }
  }
  virtual bool GraphCaptureEnabled() override {
-    bool isGraphCapture = false;
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+    if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
      switch (copyParams_.kind) {
        case hipMemcpyDeviceToDevice:
-          isGraphCapture = true;
+          return true;
          break;
        default:
          break;
      }
    }
-    return isGraphCapture;
+    return false;
  }
 };

@@ -1705,14 +1841,15 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
  GraphNode* clone() const override { return new GraphMemcpyNode1D(*this); }

  virtual hipError_t CreateCommand(hip::Stream* stream) override {
-    if (!isEnabled_ ||
-        ((kind_ == hipMemcpyHostToHost || kind_ == hipMemcpyDefault) && IsHtoHMemcpy(dst_, src_))) {
-      return hipSuccess;
-    }
+    // Clear commands_ first, even if node is disabled
    hipError_t status = GraphNode::CreateCommand(stream);
    if (status != hipSuccess) {
      return status;
    }
+    if (!isEnabled_ ||
+        ((kind_ == hipMemcpyHostToHost || kind_ == hipMemcpyDefault) && IsHtoHMemcpy(dst_, src_))) {
+      return hipSuccess;
+    }
    commands_.reserve(1);
    amd::Command* command = nullptr;
    if (!AMD_DIRECT_DISPATCH) {
@@ -1867,18 +2004,17 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
    }
  }
  virtual bool GraphCaptureEnabled() override {
-    bool isGraphCapture = false;
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+    if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
      hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_);
      switch (type) {
        case hipCopyBuffer:
-          isGraphCapture = true;
+          return true;
          break;
        default:
          break;
      }
    }
-    return isGraphCapture;
+    return false;
  }
 };

@@ -2139,6 +2275,13 @@ class GraphMemsetNode : public GraphNode {
    }
  }

+  virtual bool GraphCaptureEnabled() override {
+    if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
+      return true;
+    }
+    return false;
+  }
+
  hipError_t CreateCommand(hip::Stream* stream) override {
    hipError_t status = GraphNode::CreateCommand(stream);
    if (status != hipSuccess) {
@@ -2319,6 +2462,8 @@ class GraphHostNode : public GraphNode {
    amd::Command::EventWaitList waitList;
    commands_.reserve(1);
    amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList);
+    // This is just to invoke a callback, so no need to flush caches.
+    command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
    commands_.emplace_back(command);
    return hipSuccess;
  }
@@ -2333,6 +2478,9 @@ class GraphHostNode : public GraphNode {
      if (!commands_[0]->setCallback(CL_COMPLETE, GraphHostNode::Callback, &NodeParams_)) {
        ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed during setCallback");
      }
+      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE,
+              "EnqueueCommands: NodeParams_.fn=%p, NodeParams_.userData=%p", NodeParams_.fn,
+              NodeParams_.userData);
      commands_[0]->enqueue();
      // Add the new barrier to stall the stream, until the callback is done
      amd::Command::EventWaitList eventWaitList;
@@ -2342,6 +2490,8 @@ class GraphHostNode : public GraphNode {
      if (block_command == nullptr) {
        ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed during block command creation");
      }
+      // This is just to invoke a callback, so no need to flush caches.
+      block_command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
      block_command->enqueue();
      block_command->notifyCmdQueue();
      block_command->release();
@@ -47,6 +47,8 @@
 #define KCYN "\x1B[36m"
 #define KWHT "\x1B[37m"

+template <typename T> T ReturnPtrValue(T* ptr) { return (ptr != nullptr) ? *ptr : nullptr; }
+
 namespace hip{
  extern std::once_flag g_ihipInitialized;
 }
@@ -34,8 +34,6 @@ namespace hip {
 amd::Monitor hipArraySetLock{};
 std::unordered_set<hipArray*> hipArraySet;

-template <typename T> T ReturnPtrValue(T* ptr) { return (ptr != nullptr) ? *ptr : nullptr; }
-
 // ================================================================================================
 amd::Memory* getMemoryObject(const void* ptr, size_t& offset, size_t size) {
  auto memObj = amd::MemObjMap::FindMemObj(ptr, &offset);
@@ -131,7 +131,7 @@ Settings::Settings() {
                                                          : HIP_FORCE_DEV_KERNARG;

  limit_blit_wg_ = 16;
-  DEBUG_CLR_GRAPH_PACKET_CAPTURE = false;  // disable graph performance optimizations for PAL
+  DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING = 0;  // disable graph performance optimizations for PAL
 }

 bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -352,7 +352,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
    }
  }

-  // The hsa copy api would result in a dirty cache state
+  // The ROCR copy API guarantees coherency after the copy, so the fence is no longer dirty
  gpu().setFenceDirty(false);
  return true;
 }
@@ -590,7 +590,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c

  if (status == HSA_STATUS_SUCCESS) {
    gpu().addSystemScope();
-    // The hsa copy api would result in a dirty cache state
+    // The ROCR copy API guarantees coherency after the copy, so the fence is no longer dirty
    gpu().setFenceDirty(false);
  } else {
    gpu().Barriers().ResetCurrentSignal();
@@ -553,8 +553,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
        if (HSA_STATUS_SUCCESS != result) {
          LogError("hsa_amd_signal_async_handler() failed to set the handler!");
        } else {
-          ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
-                  prof_signal->signal_.handle, prof_signal);
+          ClPrint(amd::LOG_INFO, amd::LOG_SIG,
+                  "Set Handler: handle(0x%lx), timestamp(%p), blocking CB=%d",
+                  prof_signal->signal_.handle, prof_signal,
+                  ts->command().Callback() != nullptr && ts->GetBlocking());
        }
      }
    }
@@ -1009,7 +1011,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui

  // Check for queue full and wait if needed.
  uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
-  fence_dirty_ = true;
+  setFenceDirty(true);

  if (addSystemScope_) {
    header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
@@ -1024,14 +1026,14 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui

  // Reset fence_dirty_ flag if we submit a packet with system scopes
  if (expected_fence_state == amd::Device::kCacheStateSystem) {
-    fence_dirty_ = false;
+    setFenceDirty(false);
  }

  // Dirty optimization to save on consequent dispatch packets which have requested flushes
  if (fence_state_ == amd::Device::kCacheStateSystem &&
      expected_fence_state == amd::Device::kCacheStateSystem) {
    header = dispatchPacketHeader_;
-    fence_dirty_ = true;
+    setFenceDirty(true);
  }

  fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
@@ -1076,7 +1078,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
  if (header != 0) {
    packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), header, rest);
  }
-  ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
          "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
          "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
          "setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], private_seg_size=%u, "
@@ -1204,12 +1206,18 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
      amd::Os::yield();
    }

-    fence_dirty_ = true;
+    setFenceDirty(true);

    // Save header of first packet in this batch
    AqlPacket* firstPacket = packets[processedPackets];
    uint16_t firstPacketHeader = firstPacket->header;
    uint16_t firstPacketRest = firstPacket->setup;
+    // Separate header for doorbell ring that can be modified
+    uint16_t doorbellHeader = firstPacketHeader;
+
+    // Save header of last packet in this batch (if different from first)
+    AqlPacket* lastPacket = packets[processedPackets + batchSize - 1];
+    uint16_t lastPacketHeader = lastPacket->header;

    // Process batchSize packets
    for (size_t i = 0; i < batchSize; ++i) {
@@ -1217,8 +1225,6 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
      uint64_t index = startIndex + i;

      AqlPacket* packet = packets[packetIndex];
-      uint16_t header = packet->header;
-

      bool attachSignal = timestamp_ != nullptr || attach_signal;

@@ -1247,84 +1253,105 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
      AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];

      // For first packet in batch, invalidate header before writing
-      if (i == 0) {
+      bool isFirstPacket = (i == 0);
+      bool isLastPacket = (i == batchSize - 1);
+
+      if (isFirstPacket) {
        if (addSystemScope_) {
-          // Add system scope on the acq on first packet
-          firstPacketHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
-          firstPacketHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
+          // Add system scope to the acquire fence of the first packet (modify doorbell header)
+          doorbellHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
+          doorbellHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
        }
+        // Invalidate the header of the first packet in the batch
        packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
+      }

-        // Copy the packet and then write the valid of the first packet
-        *aql_loc = *packet;
-
-        // Restore the header of the first packet
-        packet->header = firstPacketHeader;
-      } else {
-        // For the end packet in batch set flags
-        if (i == batchSize - 1) {
-          if (addSystemScope_) {
-            // Add system scope on the release on last packet
+      // For the last packet in the batch, set the release scope and update the fence state
+      if (isLastPacket) {
+        if (addSystemScope_) {
+          // If batch has only 1 packet, update doorbell header for release scope
+          // (packet->header is already invalid, so don't modify it)
+          if (batchSize == 1) {
+            doorbellHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+            doorbellHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+          } else {
+            // Add system scope on the release on last packet (different from first)
            packet->header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
            packet->header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
-            addSystemScope_ = false;
          }
-          auto expected_fence_state =
-              extractAqlBits(packet->header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                             HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
-          // Reset fence_dirty_ flag if we submit a packet with system scopes
-          if (expected_fence_state == amd::Device::kCacheStateSystem) {
-            fence_dirty_ = false;
-          }
-          fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
+          addSystemScope_ = false;
        }
-
-        // Copy the packet to the queue
-        *aql_loc = *packet;
+        // Use doorbellHeader for single packet batch (packet->header is invalid),
+        // else use packet->header
+        uint16_t headerForFenceState = (batchSize == 1) ? doorbellHeader : packet->header;
+        auto expected_fence_state =
+            extractAqlBits(headerForFenceState, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                           HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
+        // Reset fence_dirty_ flag if we submit a packet with system scopes
+        if (expected_fence_state == amd::Device::kCacheStateSystem) {
+          setFenceDirty(false);
+        }
+        fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
      }

+      // Copy the packet to the queue
+      *aql_loc = *packet;
+
      // Print kernel name for kernel dispatch packets
      if (kernelNames && packetIndex < kernelNames->size()) {
+        // Use doorbellHeader for first packet (packet->header is invalid), else use packet->header
+        uint16_t headerForPrint = isFirstPacket ? doorbellHeader : packet->header;
        uint8_t packetType =
-            extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
+            extractAqlBits(headerForPrint, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
        if (packetType == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN2, "Graph ShaderName : %s, device id : %u",
                  (*kernelNames)[packetIndex].c_str(), dev().index());

-          ClPrint(
-              amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
-              "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
-              "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
-              "setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], "
-              "private_seg_size=%u, group_seg_size=%u, kernel_obj=0x%zx, "
-              "kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
-              "rptr=%u, wptr=%u",
-              gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, packetType,
-              extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
-              extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
-                             HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
-              extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                             HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
-              packet->setup, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
-              reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
-              Hsa::queue_load_read_index_scacquire(gpu_queue_), index);
+          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
+                  "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
+                  "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
+                  "setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], "
+                  "private_seg_size=%u, group_seg_size=%u, kernel_obj=0x%zx, "
+                  "kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
+                  "rptr=%u, wptr=%u",
+                  gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, headerForPrint, packetType,
+                  extractAqlBits(headerForPrint, HSA_PACKET_HEADER_BARRIER,
+                                 HSA_PACKET_HEADER_WIDTH_BARRIER),
+                  extractAqlBits(headerForPrint, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
+                                 HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
+                  extractAqlBits(headerForPrint, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                                 HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+                  packet->setup,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
+                  Hsa::queue_load_read_index_scacquire(gpu_queue_), index);
        }
      }
+
+      // Restore the header of the first packet
+      if (isFirstPacket) {
+        packet->header = firstPacketHeader;
+      }
+
+      // Restore the header of the last packet (if different from first)
+      if (isLastPacket && batchSize > 1) {
+        packet->header = lastPacketHeader;
+      }
    }

    // Write valid header for the first packet in the batch
    AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[startIndex & queueMask];
-    packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), firstPacketHeader, firstPacketRest);
+    packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), doorbellHeader, firstPacketRest);

    // Ring doorbell for this batch
    Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, startIndex);
@@ -1367,8 +1394,7 @@ bool VirtualGPU::dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,

  dispatchBlockingWait();

-  // Add all kernel names in bulk
-  vcmd->addKernelNames(kernelNames);
+  vcmd->setKernelNamesRef(&kernelNames);

  // Dispatch all packets with a single doorbell ring
  // Cast packets vector to AQL packets vector on the fly
@@ -1428,7 +1454,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
  uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
  uint64_t read = Hsa::queue_load_read_index_relaxed(gpu_queue_);

-  fence_dirty_ = true;
+  setFenceDirty(true);
  auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                                    HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
  if (!skipSignal && (signal.handle == 0)) {
@@ -1443,7 +1469,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,

  // Reset fence_dirty_ flag if we submit a barrier with system scopes
  if (cache_state == amd::Device::kCacheStateSystem) {
-    fence_dirty_ = false;
+    setFenceDirty(false);
  }

  while ((index - Hsa::queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1453,7 +1479,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
  packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), packetHeader, 0);

  Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, index);
-  ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
          "SWq=0x%zx, HWq=0x%zx, id=%d, BarrierAND Header = 0x%x (type=%d, barrier=%d, acquire=%d,"
          " release=%d), "
          "dep_signal=[0x%zx, 0x%zx, 0x%zx, 0x%zx, 0x%zx], completion_signal=0x%zx, "
@@ -1512,7 +1538,7 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
    }
  }

-  fence_dirty_ = true;
+  setFenceDirty(true);
  auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                                    HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);

@@ -1527,7 +1553,7 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD

  // Reset fence_dirty_ flag if we submit a barrier
  if (cache_state == amd::Device::kCacheStateSystem) {
-    fence_dirty_ = false;
+    setFenceDirty(false);
  }

  uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
@@ -1543,7 +1569,7 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD

  Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, index);

-  ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
          "SWq=0x%zx, HWq=0x%zx, id=%d, BarrierValue Header = 0x%x AmdFormat = 0x%x "
          "(type=%d, barrier=%d, acquire=%d, release=%d), "
          "signal=0x%zx, value = 0x%llx mask = 0x%llx cond: %s, completion_signal=0x%zx, "
@@ -1576,7 +1602,7 @@ void VirtualGPU::ResetQueueStates() {

 // ================================================================================================
 bool VirtualGPU::releaseGpuMemoryFence(bool skip_cpu_wait) {
-  if (hasPendingDispatch_ || !Barriers().IsExternalSignalListEmpty()) {
+  if (hasPendingDispatch_ || isFenceDirty() || !Barriers().IsExternalSignalListEmpty()) {
    // Dispatch barrier packet into the queue
    dispatchBarrierPacket(kBarrierPacketHeader);
    hasPendingDispatch_ = false;
@@ -1944,6 +1970,17 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
      } else {
        // Assume serialization on the same queue...
      }
+
+      // Check if the waiting event's queue has a dirty fence and propagate it
+      if (!isFenceDirty()) {
+        amd::Command* wait_cmd = static_cast<amd::Command*>(*it);
+        if (wait_cmd->queue() != nullptr && wait_cmd->queue() != command.queue()) {
+          device::VirtualDevice* wait_vdev = wait_cmd->queue()->vdev();
+          if (wait_vdev != nullptr && wait_vdev->isFenceDirty()) {
+            setFenceDirty(true);
+          }
+        }
+      }
    }
  }
 }
@@ -3688,7 +3725,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    if (isGraphCapture) {
      argBuffer = command_->getGraphKernArg(gpuKernel.KernargSegmentByteSize(),
                                            gpuKernel.KernargSegmentAlignment(), dev().index());
-      command_->SetKernelName(gpuKernel.getDemangledName().c_str());
+      command_->SetKernelName(gpuKernel.getDemangledName());
    } else {
      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN,
              "KernargSegmentByteSize = %lu "
@@ -3916,6 +3953,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
      if (timestamp_ != nullptr) {
        const Settings& settings = dev().settings();
        int32_t releaseFlags = vcmd.getCommandEntryScope();
+
        if (releaseFlags == Device::CacheState::kCacheStateIgnore) {
          if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) {
            dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
@@ -448,8 +448,8 @@ class VirtualGPU : public device::VirtualDevice {
  amd::Command* command() const { return command_; }

  void* allocKernArg(size_t size, size_t alignment);
-  bool isFenceDirty() const { return fence_dirty_; }
-  void setFenceDirty(bool state) { fence_dirty_ = state; }
+  bool isFenceDirty() const { return fence_dirty_.load(std::memory_order_acquire); }
+  void setFenceDirty(bool state) { fence_dirty_.store(state, std::memory_order_release); }
  void WaitCompleteSignal(hsa_signal_t signal);

  void HiddenHeapInit();
@@ -1383,6 +1383,7 @@ class AccumulateCommand : public Command {
 private:
  //! Kernel names and timestamps list for activity profiling
  std::vector<std::string> kernelNames_;
+  const std::vector<std::string>* kernelNamesRef_ = nullptr;
  std::vector<std::pair<uint64_t, uint64_t>> tsList_;

 public:
@@ -1399,13 +1400,20 @@ class AccumulateCommand : public Command {
    kernelNames_.insert(kernelNames_.end(), kernelNames.begin(), kernelNames.end());
  }

+  //! Store a non-owning pointer to kernel names; the vector must outlive getKernelNames() calls
+  void setKernelNamesRef(const std::vector<std::string>* kernelNames) {
+    kernelNamesRef_ = kernelNames;
+  }
+
  //! Add kernel timestamp to the list if available
  void addTimestamps(uint64_t startTs, uint64_t endTs) {
    tsList_.push_back(std::make_pair(startTs, endTs));
  }

  //! Return the kernel names
-  const std::vector<std::string>& getKernelNames() const { return kernelNames_; }
+  const std::vector<std::string>& getKernelNames() const {
+    return kernelNamesRef_ != nullptr ? *kernelNamesRef_ : kernelNames_;
+  }

  //! Return the kernel timestamps
  const std::vector<std::pair<uint64_t, uint64_t>>& getTimestamps() const { return tsList_; }
@@ -237,8 +237,6 @@ release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi,                                  \
        "Set initial heap size for device malloc.")                           \
 release(bool, HIP_FORCE_DEV_KERNARG, true,                                    \
         "Force device mem for kernel args.")                                 \
-release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true,                           \
-         "Enable/Disable graph packet capturing")                             \
 release(bool, GPU_DEBUG_ENABLE, false,                                        \
        "Enables collection of extra info for debugger at some perf cost")    \
 release(cstring, HIPRTC_COMPILE_OPTIONS_APPEND, "",                           \
@@ -255,6 +253,8 @@ release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4,                                \
        "Forces the number of streams for the graph parallel execution")      \
 release(uint, DEBUG_HIP_GRAPH_BATCH_SIZE, 256,                                \
        "Number of graph nodes to batch at a time")                           \
+release(uint, DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING, 1,                          \
+        "0 = Disable, 1 = Enable, 2 = Force")                                 \
 release(uint, DEBUG_HIP_BLOCK_SYNC, 50,                                       \
        "Blocks synchronization on CPU until the callback processing is done")\
 release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000,                                 \