clr: Use graph segment scheduling to process HIP Graphs (#1372)

* clr: Use graph segment scheduling to process HIP Graphs

* Add a broader path to use packet capture for all topologies
* Refactor code
* Use DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING to toggle new vs classic path,
  Enabled by default

* clr: Few fixes and improvements

* clr: Detect complex graphs to take classic path

* Use DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING=2 to force segment scheduling
  path

* clr: Fix a corner-case stack corruption

* clr: Track commands of segments instead of snapshots

* clr: Fix Batch dispatch logic

* Track fence_dirty_ flag for commands of other streams
* Dependency resolution markers can now accommodate a dirty fence on cross
  streams

---------

Co-authored-by: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Co-authored-by: Godavarthy Surya, Anusha <agodavar@amd.com>
Этот коммит содержится в:
SaleelK
2025-12-01 12:49:26 -08:00
коммит произвёл GitHub
родитель a627c12501
Коммит c105dcd05b
11 изменённых файлов: 1563 добавлений и 507 удалений
+60 -43
Просмотреть файл
@@ -1425,9 +1425,9 @@ hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraph
if (status != hipSuccess) {
HIP_RETURN(status);
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
if (graphExec->IsSegmentSchedulingEnabled()) {
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
}
HIP_RETURN(status);
}
@@ -1523,12 +1523,14 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
return hipErrorOutOfMemory;
}
graph->clone(*pGraphExec, true);
(*pGraphExec)->ScheduleNodes();
if (false == (*pGraphExec)->TopologicalOrder()) {
hipError_t scheduleStatus = (*pGraphExec)->ScheduleNodes();
if (scheduleStatus != hipSuccess) {
delete *pGraphExec;
return hipErrorInvalidValue;
*pGraphExec = nullptr;
return scheduleStatus;
}
graph->SetGraphInstantiated(true);
if (DEBUG_HIP_GRAPH_DOT_PRINT) {
static int i = 1;
std::string filename =
@@ -1538,7 +1540,10 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
LogPrintfInfo("[hipGraph] graph dump:%s", filename.c_str());
}
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
graph->SetGraphInstantiated(true);
if ((*pGraphExec)->IsSegmentSchedulingEnabled()) {
(*pGraphExec)->SetKernelArgManager(new hip::GraphKernelArgManager());
}
return (*pGraphExec)->Init();
@@ -1555,7 +1560,7 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
if (status == hipSuccess) {
*pGraphExec = reinterpret_cast<hipGraphExec_t>(ge);
}
HIP_RETURN(status);
HIP_RETURN(status, ReturnPtrValue(pGraphExec));
}
hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
@@ -1574,7 +1579,7 @@ hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t g
hip::GraphExec* ge;
hipError_t status = ihipGraphInstantiate(&ge, reinterpret_cast<hip::Graph*>(graph), flags);
*pGraphExec = reinterpret_cast<hipGraphExec_t>(ge);
HIP_RETURN(status);
HIP_RETURN(status, ReturnPtrValue(pGraphExec));
}
hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph,
@@ -1609,7 +1614,7 @@ hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t
HIP_RETURN(status);
}
HIP_RETURN(hipSuccess);
HIP_RETURN(hipSuccess, ReturnPtrValue(pGraphExec));
}
hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec) {
@@ -1820,9 +1825,9 @@ hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
if (status != hipSuccess) {
HIP_RETURN(status);
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
if (graphExec->IsSegmentSchedulingEnabled()) {
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
}
HIP_RETURN(status);
}
@@ -1871,9 +1876,9 @@ hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
if (status != hipSuccess) {
HIP_RETURN(status);
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
if (graphExec->IsSegmentSchedulingEnabled()) {
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
}
HIP_RETURN(status);
}
@@ -1931,9 +1936,9 @@ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
if (status != hipSuccess) {
HIP_RETURN(status);
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
if (graphExec->IsSegmentSchedulingEnabled()) {
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
}
HIP_RETURN(status);
}
@@ -2008,13 +2013,18 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra
if (status != hipSuccess) {
return status;
}
if (reinterpret_cast<hip::ChildGraphNode*>(clonedNode)->GetGraphCaptureStatus()) {
hip::ChildGraphNode* childNode = reinterpret_cast<hip::ChildGraphNode*>(clonedNode);
// After SetParams updates node parameters in-place, we need to update the cached AQL packets
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
if (graphExec->IsSegmentSchedulingEnabled() || childNode->GetGraphCaptureStatus()) {
std::vector<hip::GraphNode*> childGraphNodes;
reinterpret_cast<hip::ChildGraphNode*>(clonedNode)->TopologicalOrder(childGraphNodes);
childNode->TopologicalOrder(childGraphNodes);
for (std::vector<hip::GraphNode*>::size_type i = 0; i != childGraphNodes.size(); i++) {
if (childGraphNodes[i]->GraphCaptureEnabled()) {
status = reinterpret_cast<hip::ChildGraphNode*>(clonedNode)
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
status =
childNode->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
if (status != hipSuccess) {
return status;
}
@@ -2414,9 +2424,9 @@ hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec,
if (status != hipSuccess) {
HIP_RETURN(status);
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
if (graphExec->IsSegmentSchedulingEnabled()) {
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
}
HIP_RETURN(status);
}
@@ -2497,9 +2507,9 @@ hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hi
if (status != hipSuccess) {
HIP_RETURN(status);
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
if (graphExec->IsSegmentSchedulingEnabled()) {
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
}
HIP_RETURN(status);
}
@@ -2734,10 +2744,11 @@ hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph,
*updateResult_out = hipGraphExecUpdateErrorNotSupported;
}
HIP_RETURN(hipErrorGraphExecUpdateFailure);
} else if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && newGraphNodes[i]->GraphCaptureEnabled()) {
status =
reinterpret_cast<hip::GraphExec*>(hGraphExec)
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(oldGraphExecNodes[i]));
} else {
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
if (graphExec->IsSegmentSchedulingEnabled() && newGraphNodes[i]->GraphCaptureEnabled()) {
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(oldGraphExecNodes[i]));
}
}
} else {
*hErrorNode_out = reinterpret_cast<hipGraphNode_t>(newGraphNodes[i]);
@@ -3091,12 +3102,16 @@ hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNod
HIP_RETURN(hipErrorInvalidValue);
}
clonedNode->SetEnabled(isEnabled);
// Update packet batches when node is enabled/disabled
hipError_t status = graphExec->UpdatePacketBatchesForNodeEnableDisable(clonedNode, isEnabled != 0);
if (status != hipSuccess) {
HIP_RETURN(status);
hipError_t status = hipSuccess;
if (graphExec->IsSegmentSchedulingEnabled()) {
// Update packet batches when node is enabled/disabled
status = graphExec->UpdatePacketBatchesForNodeEnableDisable(clonedNode, isEnabled != 0);
if (status != hipSuccess) {
HIP_RETURN(status);
}
}
HIP_RETURN(hipSuccess);
HIP_RETURN(status);
}
hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
@@ -3449,8 +3464,9 @@ hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGrap
if (status != hipSuccess) {
HIP_RETURN(status);
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)->UpdateAQLPacket(clonedNode);
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
if (graphExec->IsSegmentSchedulingEnabled()) {
status = graphExec->UpdateAQLPacket(clonedNode);
}
HIP_RETURN(status);
}
@@ -3572,8 +3588,9 @@ hipError_t hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t no
return status;
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
status = reinterpret_cast<hip::GraphExec*>(graphExec)->UpdateAQLPacket(clonedNode);
auto graphExecPtr = reinterpret_cast<hip::GraphExec*>(graphExec);
if (graphExecPtr->IsSegmentSchedulingEnabled()) {
status = graphExecPtr->UpdateAQLPacket(clonedNode);
}
return status;
}
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+204 -54
Просмотреть файл
@@ -320,6 +320,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
const std::vector<Node>& GetDependencies() const { return dependencies_; }
/// Update graph node dependencies
void SetDependencies(std::vector<Node>& dependencies) {
dependencies_.clear();
for (auto entry : dependencies) {
dependencies_.push_back(entry);
}
@@ -366,6 +367,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
const std::vector<Node>& GetEdges() const { return edges_; }
/// Updates graph node children
void SetEdges(std::vector<Node>& edges) {
edges_.clear();
for (auto entry : edges) {
edges_.push_back(entry);
}
@@ -425,19 +427,10 @@ class GraphNode : public hipGraphNodeDOTAttribute {
}
unsigned int GetEnabled() const { return isEnabled_; }
void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; }
// Returns true if capture is enabled for the current node.
// Base implementation returns false; specific node types should override.
virtual bool GraphCaptureEnabled() {
bool isGraphCapture = false;
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
switch (GetType()) {
case hipGraphNodeTypeMemset:
isGraphCapture = true;
break;
default:
break;
}
}
return isGraphCapture;
return false;
}
virtual void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) override {
out << "[";
@@ -454,6 +447,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
out << GetLabel(flag);
if (DEBUG_HIP_GRAPH_DOT_PRINT) {
out << "\nStreamId:" << stream_id_;
out << "\nSegmentId:" << segment_id_;
out << "\nSignalIsRequired: " << ((signal_is_required_) ? "true" : "false");
out << "\nDeviceId:" << dev_id_;
}
@@ -479,6 +473,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
size_t inDegree_; //!< count of in coming edges (@todo: remove, it's dependencies_.size())
size_t outDegree_; //!< count of outgoing edges (@todo: remove, it's edges_.size())
int32_t stream_id_ = -1; //! Stream ID on which this node will be executed
int32_t segment_id_ = -1; //! Segment ID on which this node will be executed
int32_t launch_id_ = -1; //! Launch ID of this node in the entire graph execution sequence
static int nextID;
Graph* parentGraph_;
@@ -556,6 +551,8 @@ class Graph {
graphSet_.insert(this);
mem_pool_ = device->GetGraphMemoryPool();
graphInstantiated_ = false;
// Initialize per-graph segment scheduling flag from global env var
use_segment_scheduling_ = DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING;
roots_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
leafs_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
wait_order_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
@@ -568,7 +565,7 @@ class Graph {
}
}
}
~Graph() {
virtual ~Graph() {
for (auto node : vertices_) {
delete node;
}
@@ -639,6 +636,8 @@ class Graph {
const std::vector<Node>& GetTopoOrder() const { return topoOrder_; }
/// returns all the edges in the graph
std::vector<std::pair<Node, Node>> GetEdges() const;
/// Returns whether segment scheduling is enabled for this graph
bool IsSegmentSchedulingEnabled() const { return use_segment_scheduling_; }
// returns the original graph ptr if cloned
const Graph* getOriginalGraph() const { return pOriginalGraph_; }
// Add user obj resource to graph
@@ -679,7 +678,43 @@ class Graph {
);
//! Schedules all nodes in the graph into different streams
void ScheduleNodes();
hipError_t ScheduleNodes();
// Hierarchical path structure for child graph support.
// Describes one execution path at a single graph level. A child graph met along the path
// is only referenced (child_graph_node / child_graph_paths_index).
// NOTE(review): child_graph_paths_index presumably indexes
// GraphExecutionPaths::child_graph_paths of the enclosing level - confirm.
struct HierarchicalPath {
std::vector<Node> nodes; //!< Nodes in this path (at this level only)
Node child_graph_node = nullptr; //!< Reference to child graph node if present in path
int child_graph_paths_index = -1; //!< Index into child_graph_paths (-1 if no child)
int device_id = -1; //!< Device ID for this path
};
// Structure to store execution paths for a graph and its children hierarchically.
// The type is recursive: each entry of child_graph_paths is itself a GraphExecutionPaths,
// so nested child graphs form a tree of path sets.
struct GraphExecutionPaths {
Graph* graph_ptr = nullptr; //!< Pointer to the graph this belongs to
std::vector<HierarchicalPath> paths; //!< All execution paths at this level only
std::vector<GraphExecutionPaths> child_graph_paths; //!< Child graph execution paths
};
//! Schedules nodes into batches for optimized execution
hipError_t ScheduleNodesIntoBatches();
//! Find execution paths hierarchically, keeping child graphs separate
GraphExecutionPaths FindExecutionPathsHierarchical();
//! Recursively find all paths from a node with hierarchical child graph handling
void FindPathsRecursiveHierarchical(Node node,
std::vector<Node>& current_path,
std::unordered_set<unsigned int>& visited,
GraphExecutionPaths& graph_paths);
//! Create segments from hierarchical execution paths
void CreateSegmentsFromPaths(const GraphExecutionPaths& exec_paths);
//! Resolve dependencies between segments
void ResolveSegmentDependencies();
//! Calculate dependency levels for segments using topological sort
void CalculateSegmentTopoDependencyLevels();
//! Runs one node on the assigned stream
bool RunOneNode(Node node); //!< Node for the execution on GPU
@@ -785,8 +820,35 @@ class Graph {
//!< during multi-device graph execution scheduling.
std::unordered_map<int, std::set<int>> streams_dev_ids_;
int instantiateDeviceId_ = -1;
//! Topological order of the graph doesn't include nodes embedded as part of the child graph
//! Topological order of the graph doesn't include nodes embedded as part of the child graph
std::vector<Node> topoOrder_;
// Segment dependency structures.
// A Segment is a group of graph nodes together with its stream assignment, its
// topological level and its links to other segments of the same graph.
// All identifiers default to -1 and all node/graph pointers default to nullptr.
struct Segment {
int id = -1; // Segment identifier (defaults to -1)
int stream_id = -1; // Assigned stream for this segment
int dependency_level = -1; // Topological level (0 = root, 1 = depends on root, etc.)
std::vector<Node> nodes; // Nodes that belong to this segment
std::vector<int> segment_ids_dependencies; // Segments this segment depends on (within same graph)
std::vector<int> segment_ids_edges; // Segments that depend on this segment (within same graph)
// NOTE(review): presumably the first and last entries of `nodes` - confirm against the
// code that builds segments.
Node first_node = nullptr;
Node last_node = nullptr;
// Hierarchical child graph information
Graph* child_graph_ptr = nullptr; // Direct pointer to child graph for quick access
};
//! Segment information for batch scheduling
std::vector<Segment> segments_;
//! Map of node to segment ID
std::unordered_map<Node, int> node_to_segment_id_;
//! Maximum dependency level in the segment graph
int max_dependency_level_ = -1;
//! Map of dependency level to list of segment IDs at that level
std::unordered_map<int, std::vector<int>> segments_per_level_;
std::unordered_map<Node, Node> clonedNodes_;
private:
friend class GraphExec;
std::vector<Node> vertices_;
@@ -807,7 +869,10 @@ class Graph {
hip::MemoryPool* mem_pool_; //!< Memory pool, associated with this graph
std::unordered_set<GraphNode*> capturedNodes_;
bool graphInstantiated_;
std::unordered_map<Node, Node> clonedNodes_;
//! Per-graph flag to control segment scheduling.
//! Can be disabled per-graph for complex graphs that benefit from classic path.
bool use_segment_scheduling_;
//! Map of device ID to vector of streams allocated for that device during graph execution.
//! Each device may require multiple streams to handle parallel execution of graph nodes.
std::unordered_map<int, std::vector<hip::Stream*>> streams_dev_;
@@ -815,6 +880,17 @@ class Graph {
//! Map tracking the maximum number of concurrent streams required per device for graph execution.
//! Key: device ID, Value: maximum number of streams needed for that device
std::unordered_map<int, int> max_streams_dev_;
// Batch-based scheduling structures.
// A Batch is a list of nodes tagged with a stream id. Note that stream_id defaults to 0
// here, whereas id defaults to -1.
struct Batch {
int id = -1; // Batch identifier (defaults to -1)
int stream_id = 0; // Stream id associated with this batch (defaults to 0)
std::vector<Node> nodes; // Nodes that belong to this batch
// NOTE(review): presumably the ids of other streams this batch has to wait on - confirm.
std::vector<int> incoming_stream_ids;
// NOTE(review): presumably the final node of `nodes` - confirm.
Node last_node = nullptr;
};
std::vector<Batch> batches_;
};
class GraphExec : public amd::ReferenceCountedObject, public Graph {
@@ -822,6 +898,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
static std::unordered_set<GraphExec*> graphExecSet_;
static amd::Monitor graphExecSetLock_;
static amd::Monitor graphExecStreamCreateLock_;
bool graph_dumped_ = false;
GraphExec(uint64_t flags = 0)
: ReferenceCountedObject(), Graph(hip::getCurrentDevice()), flags_(flags) {
amd::ScopedLock lock(graphExecSetLock_);
@@ -832,20 +909,20 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
for (auto streams : parallel_streams_) {
for (auto stream : streams.second) {
if (stream != nullptr) {
stream->finish();
constexpr bool kForceDestroy = true;
hip::Stream::Destroy(stream, kForceDestroy);
}
}
}
parallel_streams_.clear();
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
if (IsSegmentSchedulingEnabled()) {
if (kernArgManager_ != nullptr) {
kernArgManager_->release();
}
}
packetBatches_.clear();
nodeCaptureStatus_.clear();
segmentBatches_.clear();
}
Node GetClonedNode(Node node) {
@@ -885,9 +962,13 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
hipError_t CaptureAndFormPacketsForGraph();
void GetKernelArgSizeForGraph(std::unordered_map<int, size_t>& kernArgSizeForGraph);
hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream);
//! Enqueue a multi-device linear graph for execution
hipError_t EnqueueMultiDeviceLinearGraph(hip::Stream* hip_stream);
amd::Command* EnqueueSegmentedGraph(hip::Stream* launch_stream,
const std::vector<hip::Stream*>& streams,
hipError_t* out_status = nullptr);
hipError_t EnqueueSegment(const Segment& segment, hip::Stream* stream,
amd::AccumulateCommand* accumulate);
bool TopologicalOrder() { return Graph::TopologicalOrder(topoOrder_); }
//! Update streams for the graph execution with launch stream from application
void UpdateStreams(hip::Stream* launch_stream);
@@ -895,20 +976,41 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
//! This method analyzes the stream-to-device mappings and recursively processes
//! child graphs to determine the maximum concurrent streams needed per device
void FindStreamsReqPerDev();
//! Find the number of streams required per device for packet engine mode
//! This method analyzes segments to determine per-device stream requirements
void FindStreamsReqPerDevForSegments();
//! Get the parallel streams map for synchronization before destruction.
//! Returns a read-only reference to the parallel_streams_ member (no copy is made),
//! so the result must not be used after this object has been destroyed.
const std::unordered_map<int, std::vector<hip::Stream*>>& GetParallelStreams() const {
return parallel_streams_;
}
protected:
//! Assign streams to segments at a given dependency level
void AssignStreamsToSegments(
const std::vector<int>& segments_at_level,
hip::Stream* launch_stream,
const std::vector<hip::Stream*>& streams,
std::unordered_map<int, hip::Stream*>& segment_to_stream);
//! parallel streams per device
std::unordered_map<int, std::vector<hip::Stream*>> parallel_streams_;
uint64_t flags_ = 0;
GraphKernelArgManager* kernArgManager_ = nullptr; //!< Kernel Arg manager for graph.
bool hasHiddenHeap_ = false; //!< Hidden heap indicator for Kernel node
bool repeatLaunch_ = false;
//! Tracks the last launch stream to avoid redundant UpdateStreams calls
hip::Stream* lastLaunchStream_ = nullptr;
// PacketBatch structure
struct PacketBatch {
// Main dispatch vectors - always ready for batch dispatch
std::vector<uint8_t*> dispatchPackets;
std::vector<std::string> dispatchKernelNames;
// Cached filtered lists - built on-demand when nodes are disabled
std::vector<uint8_t*> enabledPackets;
std::vector<std::string> enabledKernelNames;
// Node tracking
struct NodeRange {
size_t startIndex; // Start index in dispatchPackets
@@ -921,13 +1023,22 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
PacketBatch() {}
// O(1) enable/disable operations - just update state
void setEnabled(GraphNode* node, bool enabled);
// Rebuild cached filtered lists if cache is stale
void rebuildFilteredLists();
};
//! Structure linking packet batches to segments.
//! There is no default constructor: a SegmentBatch is always created from a segment id,
//! so containers that need a default-constructible value (e.g. unordered_map::operator[])
//! cannot be used with it.
//! NOTE(review): the single-argument constructor is not marked explicit, so an int
//! converts implicitly to SegmentBatch - confirm this is intended.
struct SegmentBatch {
int segment_id; // Segment this batch belongs to
std::vector<bool> node_capture_status; // Capture status for each node in this segment
std::vector<PacketBatch> packet_batches; // All packet batches for this segment
SegmentBatch(int seg_id) : segment_id(seg_id) {}
};
//! Batches of accumulated packets and kernel names for batch dispatch optimization
//! Each batch contains packets from consecutive captured nodes
std::vector<PacketBatch> packetBatches_;
//! Track which nodes were successfully captured (true) vs need individual execution (false)
std::vector<bool> nodeCaptureStatus_;
//! Map from segment ID to SegmentBatch for O(1) lookup
std::unordered_map<int, SegmentBatch> segmentBatches_;
};
class ChildGraphNode : public GraphNode, public GraphExec {
@@ -950,6 +1061,13 @@ class ChildGraphNode : public GraphNode, public GraphExec {
bool GetGraphCaptureStatus() { return graphCaptureStatus_; }
bool GraphCaptureEnabled() override {
if (IsSegmentSchedulingEnabled()) {
return graphCaptureStatus_;
}
return false;
}
std::vector<Node>& GetChildGraphNodeOrder() { return topoOrder_; }
void SetStream(hip::Stream* stream) override { stream_ = stream; }
@@ -959,9 +1077,26 @@ class ChildGraphNode : public GraphNode, public GraphExec {
}
void EnqueueCommands(hip::Stream* stream) override {
if (graphCaptureStatus_) {
hipError_t status = EnqueueGraphWithSingleList(stream);
// Note: For segmented graphs, EnqueueSegment now calls EnqueueSegmentedGraph recursively
// This method is kept as a fallback for non-segmented execution or legacy paths
if (graphCaptureStatus_ || !segments_.empty()) {
// Use hierarchical segment-based enqueue via EnqueueSegmentedGraph
// Use this child graph's own parallel_streams_, so pass empty vector
hipError_t status = hipSuccess;
amd::Command* last_cmd = EnqueueSegmentedGraph(stream, {}, &status);
if (last_cmd != nullptr) {
// This is a fallback path - we don't need to track the command
last_cmd->release();
}
if (status != hipSuccess) {
ClPrint(amd::LOG_ERROR, amd::LOG_CODE,
"[hipGraph] ChildGraphNode::EnqueueCommands failed with status=%d", status);
}
} else if (max_streams_ == 1) {
// Legacy topological order execution for non-segmented graphs
for (int i = 0; i < topoOrder_.size(); i++) {
topoOrder_[i]->SetStream(stream_);
hipError_t status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
@@ -1054,6 +1189,7 @@ class GraphKernelNode : public GraphNode {
out << GetLabel(flag);
if (DEBUG_HIP_GRAPH_DOT_PRINT) {
out << "StreamId:" << stream_id_;
out << "\nSegmentId:" << segment_id_;
out << "\nSignalIsRequired: " << ((signal_is_required_) ? "true" : "false");
out << "\nDeviceId:" << dev_id_;
}
@@ -1137,7 +1273,7 @@ class GraphKernelNode : public GraphNode {
}
hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
amd::Kernel* kernel = function->kernel();
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
auto device = g_devices[dev_id_]->devices()[0];
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel->getDeviceKernel(*device));
kernargSegmentByteSize_ = devKernel->KernargSegmentByteSize();
@@ -1270,6 +1406,11 @@ class GraphKernelNode : public GraphNode {
GraphNode* clone() const override { return new GraphKernelNode(*this); }
hipError_t CreateCommand(hip::Stream* stream) override {
// Clear commands_ first, even if node is disabled
hipError_t status = GraphNode::CreateCommand(stream);
if (status != hipSuccess) {
return status;
}
if (!isEnabled_) {
return hipSuccess;
}
@@ -1280,14 +1421,10 @@ class GraphKernelNode : public GraphNode {
hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
amd::Kernel* kernel = function->kernel();
amd::ScopedLock lock(function->dflock_);
hipError_t status = validateKernelParams(&kernelParams_, func, dev_id_);
status = validateKernelParams(&kernelParams_, func, dev_id_);
if (hipSuccess != status) {
return status;
}
status = GraphNode::CreateCommand(stream);
if (status != hipSuccess) {
return status;
}
commands_.reserve(1);
amd::Command* command;
uint32_t flags = 0;
@@ -1471,14 +1608,13 @@ class GraphKernelNode : public GraphNode {
}
virtual bool GraphCaptureEnabled() override {
bool isGraphCapture = false;
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
// Disable capture for cooperative kernels
if (!coopKernel_) {
isGraphCapture = true;
return true;
}
}
return isGraphCapture;
return false;
}
};
@@ -1500,15 +1636,16 @@ class GraphMemcpyNode : public GraphNode {
GraphNode* clone() const override { return new GraphMemcpyNode(*this); }
virtual hipError_t CreateCommand(hip::Stream* stream) override {
// Clear commands_ first, even if node is disabled
hipError_t status = GraphNode::CreateCommand(stream);
if (status != hipSuccess) {
return status;
}
if (!isEnabled_ ||
((copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault) &&
IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr))) {
return hipSuccess;
}
hipError_t status = GraphNode::CreateCommand(stream);
if (status != hipSuccess) {
return status;
}
commands_.reserve(1);
amd::Command* command;
status = ihipMemcpy3DCommand(command, &copyParams_, stream);
@@ -1632,17 +1769,16 @@ class GraphMemcpyNode : public GraphNode {
}
}
virtual bool GraphCaptureEnabled() override {
bool isGraphCapture = false;
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
switch (copyParams_.kind) {
case hipMemcpyDeviceToDevice:
isGraphCapture = true;
return true;
break;
default:
break;
}
}
return isGraphCapture;
return false;
}
};
@@ -1705,14 +1841,15 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
GraphNode* clone() const override { return new GraphMemcpyNode1D(*this); }
virtual hipError_t CreateCommand(hip::Stream* stream) override {
if (!isEnabled_ ||
((kind_ == hipMemcpyHostToHost || kind_ == hipMemcpyDefault) && IsHtoHMemcpy(dst_, src_))) {
return hipSuccess;
}
// Clear commands_ first, even if node is disabled
hipError_t status = GraphNode::CreateCommand(stream);
if (status != hipSuccess) {
return status;
}
if (!isEnabled_ ||
((kind_ == hipMemcpyHostToHost || kind_ == hipMemcpyDefault) && IsHtoHMemcpy(dst_, src_))) {
return hipSuccess;
}
commands_.reserve(1);
amd::Command* command = nullptr;
if (!AMD_DIRECT_DISPATCH) {
@@ -1867,18 +2004,17 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
}
}
virtual bool GraphCaptureEnabled() override {
bool isGraphCapture = false;
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_);
switch (type) {
case hipCopyBuffer:
isGraphCapture = true;
return true;
break;
default:
break;
}
}
return isGraphCapture;
return false;
}
};
@@ -2139,6 +2275,13 @@ class GraphMemsetNode : public GraphNode {
}
}
// A memset node reports capture as enabled exactly when it has a parent graph and
// that graph has segment scheduling enabled.
virtual bool GraphCaptureEnabled() override {
  return (parentGraph_ != nullptr) && parentGraph_->IsSegmentSchedulingEnabled();
}
hipError_t CreateCommand(hip::Stream* stream) override {
hipError_t status = GraphNode::CreateCommand(stream);
if (status != hipSuccess) {
@@ -2319,6 +2462,8 @@ class GraphHostNode : public GraphNode {
amd::Command::EventWaitList waitList;
commands_.reserve(1);
amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList);
// This is just to invoke a callback, so no need to flush caches.
command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
commands_.emplace_back(command);
return hipSuccess;
}
@@ -2333,6 +2478,9 @@ class GraphHostNode : public GraphNode {
if (!commands_[0]->setCallback(CL_COMPLETE, GraphHostNode::Callback, &NodeParams_)) {
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed during setCallback");
}
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE,
"EnqueueCommands: NodeParams_.fn=%p, NodeParams_.userData=%p", NodeParams_.fn,
NodeParams_.userData);
commands_[0]->enqueue();
// Add the new barrier to stall the stream, until the callback is done
amd::Command::EventWaitList eventWaitList;
@@ -2342,6 +2490,8 @@ class GraphHostNode : public GraphNode {
if (block_command == nullptr) {
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed during block command creation");
}
// This is just to invoke a callback, so no need to flush caches.
block_command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
block_command->enqueue();
block_command->notifyCmdQueue();
block_command->release();
+2
Просмотреть файл
@@ -47,6 +47,8 @@
#define KCYN "\x1B[36m"
#define KWHT "\x1B[37m"
// Safely reads through an optional out-pointer: yields *ptr when ptr is non-null and a
// null T otherwise. T therefore has to be a type that nullptr converts to (e.g. a pointer).
template <typename T> T ReturnPtrValue(T* ptr) {
  if (ptr == nullptr) {
    return nullptr;
  }
  return *ptr;
}
namespace hip{
extern std::once_flag g_ihipInitialized;
}
-2
Просмотреть файл
@@ -34,8 +34,6 @@ namespace hip {
amd::Monitor hipArraySetLock{};
std::unordered_set<hipArray*> hipArraySet;
template <typename T> T ReturnPtrValue(T* ptr) { return (ptr != nullptr) ? *ptr : nullptr; }
// ================================================================================================
amd::Memory* getMemoryObject(const void* ptr, size_t& offset, size_t size) {
auto memObj = amd::MemObjMap::FindMemObj(ptr, &offset);
+1 -1
Просмотреть файл
@@ -131,7 +131,7 @@ Settings::Settings() {
: HIP_FORCE_DEV_KERNARG;
limit_blit_wg_ = 16;
DEBUG_CLR_GRAPH_PACKET_CAPTURE = false; // disable graph performance optimizations for PAL
DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING = 0; // disable graph performance optimizations for PAL
}
bool Settings::create(const Pal::DeviceProperties& palProp,
+2 -2
Просмотреть файл
@@ -352,7 +352,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
}
}
// The hsa copy api would result in a dirty cache state
// The ROCR copy api guarantees coherency after the copy
gpu().setFenceDirty(false);
return true;
}
@@ -590,7 +590,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
if (status == HSA_STATUS_SUCCESS) {
gpu().addSystemScope();
// The hsa copy api would result in a dirty cache state
// The ROCR copy api guarantees coherency after the copy
gpu().setFenceDirty(false);
} else {
gpu().Barriers().ResetCurrentSignal();
+112 -74
Просмотреть файл
@@ -553,8 +553,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
if (HSA_STATUS_SUCCESS != result) {
LogError("hsa_amd_signal_async_handler() failed to set the handler!");
} else {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
prof_signal->signal_.handle, prof_signal);
ClPrint(amd::LOG_INFO, amd::LOG_SIG,
"Set Handler: handle(0x%lx), timestamp(%p), blocking CB=%d",
prof_signal->signal_.handle, prof_signal,
ts->command().Callback() != nullptr && ts->GetBlocking());
}
}
}
@@ -1009,7 +1011,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
// Check for queue full and wait if needed.
uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
fence_dirty_ = true;
setFenceDirty(true);
if (addSystemScope_) {
header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
@@ -1024,14 +1026,14 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
// Reset fence_dirty_ flag if we submit a packet with system scopes
if (expected_fence_state == amd::Device::kCacheStateSystem) {
fence_dirty_ = false;
setFenceDirty(false);
}
// Dirty optimization to save on consequent dispatch packets which have requested flushes
if (fence_state_ == amd::Device::kCacheStateSystem &&
expected_fence_state == amd::Device::kCacheStateSystem) {
header = dispatchPacketHeader_;
fence_dirty_ = true;
setFenceDirty(true);
}
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
@@ -1076,7 +1078,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
if (header != 0) {
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), header, rest);
}
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
"setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], private_seg_size=%u, "
@@ -1204,12 +1206,18 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
amd::Os::yield();
}
fence_dirty_ = true;
setFenceDirty(true);
// Save header of first packet in this batch
AqlPacket* firstPacket = packets[processedPackets];
uint16_t firstPacketHeader = firstPacket->header;
uint16_t firstPacketRest = firstPacket->setup;
// Separate header for doorbell ring that can be modified
uint16_t doorbellHeader = firstPacketHeader;
// Save header of last packet in this batch (if different from first)
AqlPacket* lastPacket = packets[processedPackets + batchSize - 1];
uint16_t lastPacketHeader = lastPacket->header;
// Process batchSize packets
for (size_t i = 0; i < batchSize; ++i) {
@@ -1217,8 +1225,6 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
uint64_t index = startIndex + i;
AqlPacket* packet = packets[packetIndex];
uint16_t header = packet->header;
bool attachSignal = timestamp_ != nullptr || attach_signal;
@@ -1247,84 +1253,105 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];
// For first packet in batch, invalidate header before writing
if (i == 0) {
bool isFirstPacket = (i == 0);
bool isLastPacket = (i == batchSize - 1);
if (isFirstPacket) {
if (addSystemScope_) {
// Add system scope on the acq on first packet
firstPacketHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
firstPacketHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
// Add system scope on the acq on first packet (modify doorbell header)
doorbellHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
doorbellHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
}
// Invalidate the header of the first packet in the batch
packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
}
// Copy the packet and then write the valid of the first packet
*aql_loc = *packet;
// Restore the header of the first packet
packet->header = firstPacketHeader;
} else {
// For the end packet in batch set flags
if (i == batchSize - 1) {
if (addSystemScope_) {
// Add system scope on the release on last packet
// For the end packet in batch set flags
if (isLastPacket) {
if (addSystemScope_) {
// If batch has only 1 packet, update doorbell header for release scope
// (packet->header is already invalid, so don't modify it)
if (batchSize == 1) {
doorbellHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
doorbellHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
} else {
// Add system scope on the release on last packet (different from first)
packet->header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
packet->header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
addSystemScope_ = false;
}
auto expected_fence_state =
extractAqlBits(packet->header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
// Reset fence_dirty_ flag if we submit a packet with system scopes
if (expected_fence_state == amd::Device::kCacheStateSystem) {
fence_dirty_ = false;
}
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
addSystemScope_ = false;
}
// Copy the packet to the queue
*aql_loc = *packet;
// Use doorbellHeader for single packet batch (packet->header is invalid),
// else use packet->header
uint16_t headerForFenceState = (batchSize == 1) ? doorbellHeader : packet->header;
auto expected_fence_state =
extractAqlBits(headerForFenceState, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
// Reset fence_dirty_ flag if we submit a packet with system scopes
if (expected_fence_state == amd::Device::kCacheStateSystem) {
setFenceDirty(false);
}
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
}
// Copy the packet to the queue
*aql_loc = *packet;
// Print kernel name for kernel dispatch packets
if (kernelNames && packetIndex < kernelNames->size()) {
// Use doorbellHeader for first packet (packet->header is invalid), else use packet->header
uint16_t headerForPrint = isFirstPacket ? doorbellHeader : packet->header;
uint8_t packetType =
extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
extractAqlBits(headerForPrint, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
if (packetType == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN2, "Graph ShaderName : %s, device id : %u",
(*kernelNames)[packetIndex].c_str(), dev().index());
ClPrint(
amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
"setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], "
"private_seg_size=%u, group_seg_size=%u, kernel_obj=0x%zx, "
"kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
"rptr=%u, wptr=%u",
gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, packetType,
extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
packet->setup, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
Hsa::queue_load_read_index_scacquire(gpu_queue_), index);
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
"setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], "
"private_seg_size=%u, group_seg_size=%u, kernel_obj=0x%zx, "
"kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
"rptr=%u, wptr=%u",
gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, headerForPrint, packetType,
extractAqlBits(headerForPrint, HSA_PACKET_HEADER_BARRIER,
HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(headerForPrint, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
extractAqlBits(headerForPrint, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
packet->setup,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
Hsa::queue_load_read_index_scacquire(gpu_queue_), index);
}
}
// Restore the header of the first packet
if (isFirstPacket) {
packet->header = firstPacketHeader;
}
// Restore the header of the last packet (if different from first)
if (isLastPacket && batchSize > 1) {
packet->header = lastPacketHeader;
}
}
// Write valid header for the first packet in the batch
AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[startIndex & queueMask];
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), firstPacketHeader, firstPacketRest);
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), doorbellHeader, firstPacketRest);
// Ring doorbell for this batch
Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, startIndex);
@@ -1367,8 +1394,7 @@ bool VirtualGPU::dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
dispatchBlockingWait();
// Add all kernel names in bulk
vcmd->addKernelNames(kernelNames);
vcmd->setKernelNamesRef(&kernelNames);
// Dispatch all packets with a single doorbell ring
// Cast packets vector to AQL packets vector on the fly
@@ -1428,7 +1454,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
uint64_t read = Hsa::queue_load_read_index_relaxed(gpu_queue_);
fence_dirty_ = true;
setFenceDirty(true);
auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
if (!skipSignal && (signal.handle == 0)) {
@@ -1443,7 +1469,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
// Reset fence_dirty_ flag if we submit a barrier with system scopes
if (cache_state == amd::Device::kCacheStateSystem) {
fence_dirty_ = false;
setFenceDirty(false);
}
while ((index - Hsa::queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1453,7 +1479,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), packetHeader, 0);
Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, index);
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
"SWq=0x%zx, HWq=0x%zx, id=%d, BarrierAND Header = 0x%x (type=%d, barrier=%d, acquire=%d,"
" release=%d), "
"dep_signal=[0x%zx, 0x%zx, 0x%zx, 0x%zx, 0x%zx], completion_signal=0x%zx, "
@@ -1512,7 +1538,7 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
}
}
fence_dirty_ = true;
setFenceDirty(true);
auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
@@ -1527,7 +1553,7 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
// Reset fence_dirty_ flag if we submit a barrier
if (cache_state == amd::Device::kCacheStateSystem) {
fence_dirty_ = false;
setFenceDirty(false);
}
uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
@@ -1543,7 +1569,7 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, index);
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
"SWq=0x%zx, HWq=0x%zx, id=%d, BarrierValue Header = 0x%x AmdFormat = 0x%x "
"(type=%d, barrier=%d, acquire=%d, release=%d), "
"signal=0x%zx, value = 0x%llx mask = 0x%llx cond: %s, completion_signal=0x%zx, "
@@ -1576,7 +1602,7 @@ void VirtualGPU::ResetQueueStates() {
// ================================================================================================
bool VirtualGPU::releaseGpuMemoryFence(bool skip_cpu_wait) {
if (hasPendingDispatch_ || !Barriers().IsExternalSignalListEmpty()) {
if (hasPendingDispatch_ || isFenceDirty() || !Barriers().IsExternalSignalListEmpty()) {
// Dispatch barrier packet into the queue
dispatchBarrierPacket(kBarrierPacketHeader);
hasPendingDispatch_ = false;
@@ -1944,6 +1970,17 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
} else {
// Assume serialization on the same queue...
}
// Check if the waiting event's queue has a dirty fence and propagate it
if (!isFenceDirty()) {
amd::Command* wait_cmd = static_cast<amd::Command*>(*it);
if (wait_cmd->queue() != nullptr && wait_cmd->queue() != command.queue()) {
device::VirtualDevice* wait_vdev = wait_cmd->queue()->vdev();
if (wait_vdev != nullptr && wait_vdev->isFenceDirty()) {
setFenceDirty(true);
}
}
}
}
}
}
@@ -3688,7 +3725,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
if (isGraphCapture) {
argBuffer = command_->getGraphKernArg(gpuKernel.KernargSegmentByteSize(),
gpuKernel.KernargSegmentAlignment(), dev().index());
command_->SetKernelName(gpuKernel.getDemangledName().c_str());
command_->SetKernelName(gpuKernel.getDemangledName());
} else {
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN,
"KernargSegmentByteSize = %lu "
@@ -3916,6 +3953,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
if (timestamp_ != nullptr) {
const Settings& settings = dev().settings();
int32_t releaseFlags = vcmd.getCommandEntryScope();
if (releaseFlags == Device::CacheState::kCacheStateIgnore) {
if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) {
dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
+2 -2
Просмотреть файл
@@ -448,8 +448,8 @@ class VirtualGPU : public device::VirtualDevice {
amd::Command* command() const { return command_; }
void* allocKernArg(size_t size, size_t alignment);
bool isFenceDirty() const { return fence_dirty_; }
void setFenceDirty(bool state) { fence_dirty_ = state; }
// Returns the current value of fence_dirty_, read with acquire ordering.
bool isFenceDirty() const { return fence_dirty_.load(std::memory_order_acquire); }
// Stores state into fence_dirty_ with release ordering (the matching read is the acquire load
// in isFenceDirty()).
void setFenceDirty(bool state) { fence_dirty_.store(state, std::memory_order_release); }
void WaitCompleteSignal(hsa_signal_t signal);
void HiddenHeapInit();
+9 -1
Просмотреть файл
@@ -1383,6 +1383,7 @@ class AccumulateCommand : public Command {
private:
//! Kernel names and timestamps list for activity profiling
std::vector<std::string> kernelNames_;
const std::vector<std::string>* kernelNamesRef_ = nullptr;
std::vector<std::pair<uint64_t, uint64_t>> tsList_;
public:
@@ -1399,13 +1400,20 @@ class AccumulateCommand : public Command {
kernelNames_.insert(kernelNames_.end(), kernelNames.begin(), kernelNames.end());
}
//! Set kernel names by reference: only the pointer is stored, the strings are not copied.
//! While the stored pointer is non-null, getKernelNames() returns the referenced vector
//! instead of kernelNames_.
//! NOTE(review): the referenced vector has to stay alive for as long as getKernelNames()
//! may be called on this command - confirm the lifetime at every call site.
void setKernelNamesRef(const std::vector<std::string>* kernelNames) {
kernelNamesRef_ = kernelNames;
}
//! Append one (start, end) timestamp pair to the end of the timestamp list
void addTimestamps(uint64_t startTs, uint64_t endTs) {
  tsList_.emplace_back(startTs, endTs);
}
//! Return the kernel names
const std::vector<std::string>& getKernelNames() const { return kernelNames_; }
//! Kernel names for this command: the vector registered through setKernelNamesRef()
//! when one is set, otherwise the names accumulated in kernelNames_
const std::vector<std::string>& getKernelNames() const {
  if (kernelNamesRef_ == nullptr) {
    return kernelNames_;
  }
  return *kernelNamesRef_;
}
//! Return the recorded (start, end) timestamp pairs, in the order addTimestamps() appended them
const std::vector<std::pair<uint64_t, uint64_t>>& getTimestamps() const { return tsList_; }
+2 -2
Просмотреть файл
@@ -237,8 +237,6 @@ release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \
"Set initial heap size for device malloc.") \
release(bool, HIP_FORCE_DEV_KERNARG, true, \
"Force device mem for kernel args.") \
release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true, \
"Enable/Disable graph packet capturing") \
release(bool, GPU_DEBUG_ENABLE, false, \
"Enables collection of extra info for debugger at some perf cost") \
release(cstring, HIPRTC_COMPILE_OPTIONS_APPEND, "", \
@@ -255,6 +253,8 @@ release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \
"Forces the number of streams for the graph parallel execution") \
release(uint, DEBUG_HIP_GRAPH_BATCH_SIZE, 256, \
"Number of graph nodes to batch at a time") \
release(uint, DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING, 1, \
"0 = Disable, 1 = Enable, 2 = Force") \
release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \
"Blocks synchronization on CPU until the callback processing is done")\
release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000, \