clr: Use graph segment scheduling to process HIP Graphs (#1372)
* clr: Use graph segment scheduling to process HIP Graphs * Add a broader path to use packet capture for all topologies * Refactor code * Use DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING to toggle new vs classic path; enabled by default * clr: A few fixes and improvements * clr: Detect complex graphs to take classic path * Use DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING=2 to force segment scheduling path * clr: Fix a corner-case stack corruption * clr: Track commands of segments instead of snapshots * clr: Fix Batch dispatch logic * Track fence_dirty_ flag for commands of other streams * Dependency resolution markers can now accommodate dirty fences on cross streams --------- Co-authored-by: Ioannis Assiouras <Ioannis.Assiouras@amd.com> Co-authored-by: Godavarthy Surya, Anusha <agodavar@amd.com>
이 커밋은 다음에 포함됨:
@@ -1425,9 +1425,9 @@ hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraph
|
||||
if (status != hipSuccess) {
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
|
||||
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
|
||||
if (graphExec->IsSegmentSchedulingEnabled()) {
|
||||
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
}
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
@@ -1523,12 +1523,14 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
graph->clone(*pGraphExec, true);
|
||||
(*pGraphExec)->ScheduleNodes();
|
||||
if (false == (*pGraphExec)->TopologicalOrder()) {
|
||||
|
||||
hipError_t scheduleStatus = (*pGraphExec)->ScheduleNodes();
|
||||
if (scheduleStatus != hipSuccess) {
|
||||
delete *pGraphExec;
|
||||
return hipErrorInvalidValue;
|
||||
*pGraphExec = nullptr;
|
||||
return scheduleStatus;
|
||||
}
|
||||
graph->SetGraphInstantiated(true);
|
||||
|
||||
if (DEBUG_HIP_GRAPH_DOT_PRINT) {
|
||||
static int i = 1;
|
||||
std::string filename =
|
||||
@@ -1538,7 +1540,10 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
|
||||
LogPrintfInfo("[hipGraph] graph dump:%s", filename.c_str());
|
||||
}
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
|
||||
graph->SetGraphInstantiated(true);
|
||||
|
||||
if ((*pGraphExec)->IsSegmentSchedulingEnabled()) {
|
||||
(*pGraphExec)->SetKernelArgManager(new hip::GraphKernelArgManager());
|
||||
}
|
||||
return (*pGraphExec)->Init();
|
||||
@@ -1555,7 +1560,7 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
|
||||
if (status == hipSuccess) {
|
||||
*pGraphExec = reinterpret_cast<hipGraphExec_t>(ge);
|
||||
}
|
||||
HIP_RETURN(status);
|
||||
HIP_RETURN(status, ReturnPtrValue(pGraphExec));
|
||||
}
|
||||
|
||||
hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
|
||||
@@ -1574,7 +1579,7 @@ hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t g
|
||||
hip::GraphExec* ge;
|
||||
hipError_t status = ihipGraphInstantiate(&ge, reinterpret_cast<hip::Graph*>(graph), flags);
|
||||
*pGraphExec = reinterpret_cast<hipGraphExec_t>(ge);
|
||||
HIP_RETURN(status);
|
||||
HIP_RETURN(status, ReturnPtrValue(pGraphExec));
|
||||
}
|
||||
|
||||
hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph,
|
||||
@@ -1609,7 +1614,7 @@ hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
|
||||
HIP_RETURN(hipSuccess);
|
||||
HIP_RETURN(hipSuccess, ReturnPtrValue(pGraphExec));
|
||||
}
|
||||
|
||||
hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec) {
|
||||
@@ -1820,9 +1825,9 @@ hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
|
||||
if (status != hipSuccess) {
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
|
||||
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
|
||||
if (graphExec->IsSegmentSchedulingEnabled()) {
|
||||
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
}
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
@@ -1871,9 +1876,9 @@ hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
|
||||
if (status != hipSuccess) {
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
|
||||
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
|
||||
if (graphExec->IsSegmentSchedulingEnabled()) {
|
||||
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
}
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
@@ -1931,9 +1936,9 @@ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
|
||||
if (status != hipSuccess) {
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
|
||||
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
|
||||
if (graphExec->IsSegmentSchedulingEnabled()) {
|
||||
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
}
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
@@ -2008,13 +2013,18 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
if (reinterpret_cast<hip::ChildGraphNode*>(clonedNode)->GetGraphCaptureStatus()) {
|
||||
|
||||
hip::ChildGraphNode* childNode = reinterpret_cast<hip::ChildGraphNode*>(clonedNode);
|
||||
|
||||
// After SetParams updates node parameters in-place, we need to update the cached AQL packets
|
||||
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
|
||||
if (graphExec->IsSegmentSchedulingEnabled() || childNode->GetGraphCaptureStatus()) {
|
||||
std::vector<hip::GraphNode*> childGraphNodes;
|
||||
reinterpret_cast<hip::ChildGraphNode*>(clonedNode)->TopologicalOrder(childGraphNodes);
|
||||
childNode->TopologicalOrder(childGraphNodes);
|
||||
for (std::vector<hip::GraphNode*>::size_type i = 0; i != childGraphNodes.size(); i++) {
|
||||
if (childGraphNodes[i]->GraphCaptureEnabled()) {
|
||||
status = reinterpret_cast<hip::ChildGraphNode*>(clonedNode)
|
||||
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
|
||||
status =
|
||||
childNode->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
@@ -2414,9 +2424,9 @@ hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec,
|
||||
if (status != hipSuccess) {
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
|
||||
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
|
||||
if (graphExec->IsSegmentSchedulingEnabled()) {
|
||||
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
}
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
@@ -2497,9 +2507,9 @@ hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hi
|
||||
if (status != hipSuccess) {
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
|
||||
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
|
||||
if (graphExec->IsSegmentSchedulingEnabled()) {
|
||||
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(clonedNode));
|
||||
}
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
@@ -2734,10 +2744,11 @@ hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph,
|
||||
*updateResult_out = hipGraphExecUpdateErrorNotSupported;
|
||||
}
|
||||
HIP_RETURN(hipErrorGraphExecUpdateFailure);
|
||||
} else if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && newGraphNodes[i]->GraphCaptureEnabled()) {
|
||||
status =
|
||||
reinterpret_cast<hip::GraphExec*>(hGraphExec)
|
||||
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(oldGraphExecNodes[i]));
|
||||
} else {
|
||||
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
|
||||
if (graphExec->IsSegmentSchedulingEnabled() && newGraphNodes[i]->GraphCaptureEnabled()) {
|
||||
status = graphExec->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(oldGraphExecNodes[i]));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
*hErrorNode_out = reinterpret_cast<hipGraphNode_t>(newGraphNodes[i]);
|
||||
@@ -3091,12 +3102,16 @@ hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNod
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
clonedNode->SetEnabled(isEnabled);
|
||||
// Update packet batches when node is enabled/disabled
|
||||
hipError_t status = graphExec->UpdatePacketBatchesForNodeEnableDisable(clonedNode, isEnabled != 0);
|
||||
if (status != hipSuccess) {
|
||||
HIP_RETURN(status);
|
||||
|
||||
hipError_t status = hipSuccess;
|
||||
if (graphExec->IsSegmentSchedulingEnabled()) {
|
||||
// Update packet batches when node is enabled/disabled
|
||||
status = graphExec->UpdatePacketBatchesForNodeEnableDisable(clonedNode, isEnabled != 0);
|
||||
if (status != hipSuccess) {
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
}
|
||||
HIP_RETURN(hipSuccess);
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
|
||||
hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
|
||||
@@ -3449,8 +3464,9 @@ hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGrap
|
||||
if (status != hipSuccess) {
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)->UpdateAQLPacket(clonedNode);
|
||||
auto graphExec = reinterpret_cast<hip::GraphExec*>(hGraphExec);
|
||||
if (graphExec->IsSegmentSchedulingEnabled()) {
|
||||
status = graphExec->UpdateAQLPacket(clonedNode);
|
||||
}
|
||||
HIP_RETURN(status);
|
||||
}
|
||||
@@ -3572,8 +3588,9 @@ hipError_t hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t no
|
||||
return status;
|
||||
}
|
||||
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
status = reinterpret_cast<hip::GraphExec*>(graphExec)->UpdateAQLPacket(clonedNode);
|
||||
auto graphExecPtr = reinterpret_cast<hip::GraphExec*>(graphExec);
|
||||
if (graphExecPtr->IsSegmentSchedulingEnabled()) {
|
||||
status = graphExecPtr->UpdateAQLPacket(clonedNode);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
Diff 로드
@@ -320,6 +320,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
const std::vector<Node>& GetDependencies() const { return dependencies_; }
|
||||
/// Update graph node dependencies
|
||||
void SetDependencies(std::vector<Node>& dependencies) {
|
||||
dependencies_.clear();
|
||||
for (auto entry : dependencies) {
|
||||
dependencies_.push_back(entry);
|
||||
}
|
||||
@@ -366,6 +367,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
const std::vector<Node>& GetEdges() const { return edges_; }
|
||||
/// Updates graph node children
|
||||
void SetEdges(std::vector<Node>& edges) {
|
||||
edges_.clear();
|
||||
for (auto entry : edges) {
|
||||
edges_.push_back(entry);
|
||||
}
|
||||
@@ -425,19 +427,10 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
}
|
||||
unsigned int GetEnabled() const { return isEnabled_; }
|
||||
void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; }
|
||||
// Returns true if capture is enabled for the current node.
|
||||
|
||||
// Base implementation returns false; specific node types should override.
|
||||
virtual bool GraphCaptureEnabled() {
|
||||
bool isGraphCapture = false;
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
switch (GetType()) {
|
||||
case hipGraphNodeTypeMemset:
|
||||
isGraphCapture = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return isGraphCapture;
|
||||
return false;
|
||||
}
|
||||
virtual void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) override {
|
||||
out << "[";
|
||||
@@ -454,6 +447,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
out << GetLabel(flag);
|
||||
if (DEBUG_HIP_GRAPH_DOT_PRINT) {
|
||||
out << "\nStreamId:" << stream_id_;
|
||||
out << "\nSegmentId:" << segment_id_;
|
||||
out << "\nSignalIsRequired: " << ((signal_is_required_) ? "true" : "false");
|
||||
out << "\nDeviceId:" << dev_id_;
|
||||
}
|
||||
@@ -479,6 +473,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
size_t inDegree_; //!< count of in coming edges (@todo: remove, it's dependencies_.size())
|
||||
size_t outDegree_; //!< count of outgoing edges (@todo: remove, it's edges_.size())
|
||||
int32_t stream_id_ = -1; //!< Stream ID on which this node will be executed
|
||||
int32_t segment_id_ = -1; //!< Segment ID on which this node will be executed
|
||||
int32_t launch_id_ = -1; //!< Launch ID of this node in the entire graph execution sequence
|
||||
static int nextID;
|
||||
Graph* parentGraph_;
|
||||
@@ -556,6 +551,8 @@ class Graph {
|
||||
graphSet_.insert(this);
|
||||
mem_pool_ = device->GetGraphMemoryPool();
|
||||
graphInstantiated_ = false;
|
||||
// Initialize per-graph segment scheduling flag from global env var
|
||||
use_segment_scheduling_ = DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING;
|
||||
roots_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
|
||||
leafs_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
|
||||
wait_order_.resize(DEBUG_HIP_FORCE_GRAPH_QUEUES);
|
||||
@@ -568,7 +565,7 @@ class Graph {
|
||||
}
|
||||
}
|
||||
}
|
||||
~Graph() {
|
||||
virtual ~Graph() {
|
||||
for (auto node : vertices_) {
|
||||
delete node;
|
||||
}
|
||||
@@ -639,6 +636,8 @@ class Graph {
|
||||
const std::vector<Node>& GetTopoOrder() const { return topoOrder_; }
|
||||
/// returns all the edges in the graph
|
||||
std::vector<std::pair<Node, Node>> GetEdges() const;
|
||||
/// Returns whether segment scheduling is enabled for this graph
|
||||
bool IsSegmentSchedulingEnabled() const { return use_segment_scheduling_; }
|
||||
// returns the original graph ptr if cloned
|
||||
const Graph* getOriginalGraph() const { return pOriginalGraph_; }
|
||||
// Add user obj resource to graph
|
||||
@@ -679,7 +678,43 @@ class Graph {
|
||||
);
|
||||
|
||||
//! Schedules all nodes in the graph into different streams
|
||||
void ScheduleNodes();
|
||||
hipError_t ScheduleNodes();
|
||||
|
||||
// Hierarchical path structure for child graph support
|
||||
struct HierarchicalPath {
|
||||
std::vector<Node> nodes; //!< Nodes in this path (at this level only)
|
||||
Node child_graph_node = nullptr; //!< Reference to child graph node if present in path
|
||||
int child_graph_paths_index = -1; //!< Index into child_graph_paths (-1 if no child)
|
||||
int device_id = -1; //!< Device ID for this path
|
||||
};
|
||||
|
||||
// Structure to store execution paths for a graph and its children hierarchically
|
||||
struct GraphExecutionPaths {
|
||||
Graph* graph_ptr = nullptr; //!< Pointer to the graph this belongs to
|
||||
std::vector<HierarchicalPath> paths; //!< All execution paths at this level only
|
||||
std::vector<GraphExecutionPaths> child_graph_paths; //!< Child graph execution paths
|
||||
};
|
||||
|
||||
//! Schedules nodes into batches for optimized execution
|
||||
hipError_t ScheduleNodesIntoBatches();
|
||||
|
||||
//! Find execution paths hierarchically, keeping child graphs separate
|
||||
GraphExecutionPaths FindExecutionPathsHierarchical();
|
||||
|
||||
//! Recursively find all paths from a node with hierarchical child graph handling
|
||||
void FindPathsRecursiveHierarchical(Node node,
|
||||
std::vector<Node>& current_path,
|
||||
std::unordered_set<unsigned int>& visited,
|
||||
GraphExecutionPaths& graph_paths);
|
||||
|
||||
//! Create segments from hierarchical execution paths
|
||||
void CreateSegmentsFromPaths(const GraphExecutionPaths& exec_paths);
|
||||
|
||||
//! Resolve dependencies between segments
|
||||
void ResolveSegmentDependencies();
|
||||
|
||||
//! Calculate dependency levels for segments using topological sort
|
||||
void CalculateSegmentTopoDependencyLevels();
|
||||
|
||||
//! Runs one node on the assigned stream
|
||||
bool RunOneNode(Node node); //!< Node for the execution on GPU
|
||||
@@ -785,8 +820,35 @@ class Graph {
|
||||
//!< during multi-device graph execution scheduling.
|
||||
std::unordered_map<int, std::set<int>> streams_dev_ids_;
|
||||
int instantiateDeviceId_ = -1;
|
||||
//! Topological order of the graph doesn't include nodes embedded as part of the child graph
|
||||
//! Topological order of the graph doesn't include nodes embedded as part of the child graph
|
||||
std::vector<Node> topoOrder_;
|
||||
|
||||
// Segment dependency structures
|
||||
struct Segment {
|
||||
int id = -1;
|
||||
int stream_id = -1; // Assigned stream for this segment
|
||||
int dependency_level = -1; // Topological level (0 = root, 1 = depends on root, etc.)
|
||||
std::vector<Node> nodes;
|
||||
std::vector<int> segment_ids_dependencies; // Segments this segment depends on (within same graph)
|
||||
std::vector<int> segment_ids_edges; // Segments that depend on this segment (within same graph)
|
||||
Node first_node = nullptr;
|
||||
Node last_node = nullptr;
|
||||
|
||||
// Hierarchical child graph information
|
||||
Graph* child_graph_ptr = nullptr; // Direct pointer to child graph for quick access
|
||||
};
|
||||
|
||||
//! Segment information for batch scheduling
|
||||
std::vector<Segment> segments_;
|
||||
//! Map of node to segment ID
|
||||
std::unordered_map<Node, int> node_to_segment_id_;
|
||||
//! Maximum dependency level in the segment graph
|
||||
int max_dependency_level_ = -1;
|
||||
//! Map of dependency level to list of segment IDs at that level
|
||||
std::unordered_map<int, std::vector<int>> segments_per_level_;
|
||||
|
||||
std::unordered_map<Node, Node> clonedNodes_;
|
||||
|
||||
private:
|
||||
friend class GraphExec;
|
||||
std::vector<Node> vertices_;
|
||||
@@ -807,7 +869,10 @@ class Graph {
|
||||
hip::MemoryPool* mem_pool_; //!< Memory pool, associated with this graph
|
||||
std::unordered_set<GraphNode*> capturedNodes_;
|
||||
bool graphInstantiated_;
|
||||
std::unordered_map<Node, Node> clonedNodes_;
|
||||
//! Per-graph flag to control segment scheduling
|
||||
//! Can be disabled per-graph for complex graphs that benefit from classic path
|
||||
bool use_segment_scheduling_;
|
||||
|
||||
//! Map of device ID to vector of streams allocated for that device during graph execution.
|
||||
//! Each device may require multiple streams to handle parallel execution of graph nodes.
|
||||
std::unordered_map<int, std::vector<hip::Stream*>> streams_dev_;
|
||||
@@ -815,6 +880,17 @@ class Graph {
|
||||
//! Map tracking the maximum number of concurrent streams required per device for graph execution.
|
||||
//! Key: device ID, Value: maximum number of streams needed for that device
|
||||
std::unordered_map<int, int> max_streams_dev_;
|
||||
|
||||
// Batch-based scheduling structures
|
||||
struct Batch {
|
||||
int id = -1;
|
||||
int stream_id = 0;
|
||||
std::vector<Node> nodes;
|
||||
std::vector<int> incoming_stream_ids;
|
||||
Node last_node = nullptr;
|
||||
};
|
||||
|
||||
std::vector<Batch> batches_;
|
||||
};
|
||||
|
||||
class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
@@ -822,6 +898,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
static std::unordered_set<GraphExec*> graphExecSet_;
|
||||
static amd::Monitor graphExecSetLock_;
|
||||
static amd::Monitor graphExecStreamCreateLock_;
|
||||
bool graph_dumped_ = false;
|
||||
GraphExec(uint64_t flags = 0)
|
||||
: ReferenceCountedObject(), Graph(hip::getCurrentDevice()), flags_(flags) {
|
||||
amd::ScopedLock lock(graphExecSetLock_);
|
||||
@@ -832,20 +909,20 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
for (auto streams : parallel_streams_) {
|
||||
for (auto stream : streams.second) {
|
||||
if (stream != nullptr) {
|
||||
stream->finish();
|
||||
constexpr bool kForceDestroy = true;
|
||||
hip::Stream::Destroy(stream, kForceDestroy);
|
||||
}
|
||||
}
|
||||
}
|
||||
parallel_streams_.clear();
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
if (IsSegmentSchedulingEnabled()) {
|
||||
if (kernArgManager_ != nullptr) {
|
||||
kernArgManager_->release();
|
||||
}
|
||||
}
|
||||
|
||||
packetBatches_.clear();
|
||||
nodeCaptureStatus_.clear();
|
||||
segmentBatches_.clear();
|
||||
}
|
||||
|
||||
Node GetClonedNode(Node node) {
|
||||
@@ -885,9 +962,13 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
|
||||
hipError_t CaptureAndFormPacketsForGraph();
|
||||
void GetKernelArgSizeForGraph(std::unordered_map<int, size_t>& kernArgSizeForGraph);
|
||||
hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream);
|
||||
//! Enqueue a multi-device linear graph for execution
|
||||
hipError_t EnqueueMultiDeviceLinearGraph(hip::Stream* hip_stream);
|
||||
|
||||
amd::Command* EnqueueSegmentedGraph(hip::Stream* launch_stream,
|
||||
const std::vector<hip::Stream*>& streams,
|
||||
hipError_t* out_status = nullptr);
|
||||
hipError_t EnqueueSegment(const Segment& segment, hip::Stream* stream,
|
||||
amd::AccumulateCommand* accumulate);
|
||||
|
||||
bool TopologicalOrder() { return Graph::TopologicalOrder(topoOrder_); }
|
||||
//! Update streams for the graph execution with launch stream from application
|
||||
void UpdateStreams(hip::Stream* launch_stream);
|
||||
@@ -895,20 +976,41 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
//! This method analyzes the stream-to-device mappings and recursively processes
|
||||
//! child graphs to determine the maximum concurrent streams needed per device
|
||||
void FindStreamsReqPerDev();
|
||||
//! Find the number of streams required per device for packet engine mode
|
||||
//! This method analyzes segments to determine per-device stream requirements
|
||||
void FindStreamsReqPerDevForSegments();
|
||||
//! Get the parallel streams map for synchronization before destruction
|
||||
const std::unordered_map<int, std::vector<hip::Stream*>>& GetParallelStreams() const {
|
||||
return parallel_streams_;
|
||||
}
|
||||
|
||||
protected:
|
||||
//! Assign streams to segments at a given dependency level
|
||||
void AssignStreamsToSegments(
|
||||
const std::vector<int>& segments_at_level,
|
||||
hip::Stream* launch_stream,
|
||||
const std::vector<hip::Stream*>& streams,
|
||||
std::unordered_map<int, hip::Stream*>& segment_to_stream);
|
||||
|
||||
//! parallel streams per device
|
||||
std::unordered_map<int, std::vector<hip::Stream*>> parallel_streams_;
|
||||
uint64_t flags_ = 0;
|
||||
GraphKernelArgManager* kernArgManager_ = nullptr; //!< Kernel Arg manager for graph.
|
||||
bool hasHiddenHeap_ = false; //!< Hidden heap indicator for Kernel node
|
||||
bool repeatLaunch_ = false;
|
||||
//! Track last launch stream to avoid redundant UpdateStreams
|
||||
hip::Stream* lastLaunchStream_ = nullptr;
|
||||
|
||||
// PacketBatch structure
|
||||
struct PacketBatch {
|
||||
// Main dispatch vectors - always ready for batch dispatch
|
||||
std::vector<uint8_t*> dispatchPackets;
|
||||
std::vector<std::string> dispatchKernelNames;
|
||||
|
||||
// Cached filtered lists - built on-demand when nodes are disabled
|
||||
std::vector<uint8_t*> enabledPackets;
|
||||
std::vector<std::string> enabledKernelNames;
|
||||
|
||||
// Node tracking
|
||||
struct NodeRange {
|
||||
size_t startIndex; // Start index in dispatchPackets
|
||||
@@ -921,13 +1023,22 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
PacketBatch() {}
|
||||
// O(1) enable/disable operations - just update state
|
||||
void setEnabled(GraphNode* node, bool enabled);
|
||||
// Rebuild cached filtered lists if cache is stale
|
||||
void rebuildFilteredLists();
|
||||
};
|
||||
|
||||
//! Structure linking packet batches to segments
|
||||
struct SegmentBatch {
|
||||
int segment_id; // Segment this batch belongs to
|
||||
std::vector<bool> node_capture_status; // Capture status for each node in this segment
|
||||
std::vector<PacketBatch> packet_batches; // All packet batches for this segment
|
||||
|
||||
SegmentBatch(int seg_id) : segment_id(seg_id) {}
|
||||
};
|
||||
|
||||
//! Batches of accumulated packets and kernel names for batch dispatch optimization
|
||||
//! Each batch contains packets from consecutive captured nodes
|
||||
std::vector<PacketBatch> packetBatches_;
|
||||
//! Track which nodes were successfully captured (true) vs need individual execution (false)
|
||||
std::vector<bool> nodeCaptureStatus_;
|
||||
//! Map from segment ID to SegmentBatch for O(1) lookup
|
||||
std::unordered_map<int, SegmentBatch> segmentBatches_;
|
||||
};
|
||||
|
||||
class ChildGraphNode : public GraphNode, public GraphExec {
|
||||
@@ -950,6 +1061,13 @@ class ChildGraphNode : public GraphNode, public GraphExec {
|
||||
|
||||
bool GetGraphCaptureStatus() { return graphCaptureStatus_; }
|
||||
|
||||
bool GraphCaptureEnabled() override {
|
||||
if (IsSegmentSchedulingEnabled()) {
|
||||
return graphCaptureStatus_;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<Node>& GetChildGraphNodeOrder() { return topoOrder_; }
|
||||
|
||||
void SetStream(hip::Stream* stream) override { stream_ = stream; }
|
||||
@@ -959,9 +1077,26 @@ class ChildGraphNode : public GraphNode, public GraphExec {
|
||||
}
|
||||
|
||||
void EnqueueCommands(hip::Stream* stream) override {
|
||||
if (graphCaptureStatus_) {
|
||||
hipError_t status = EnqueueGraphWithSingleList(stream);
|
||||
// Note: For segmented graphs, EnqueueSegment now calls EnqueueSegmentedGraph recursively
|
||||
// This method is kept as a fallback for non-segmented execution or legacy paths
|
||||
|
||||
if (graphCaptureStatus_ || !segments_.empty()) {
|
||||
// Use hierarchical segment-based enqueue via EnqueueSegmentedGraph
|
||||
// Use this child graph's own parallel_streams_, so pass empty vector
|
||||
hipError_t status = hipSuccess;
|
||||
amd::Command* last_cmd = EnqueueSegmentedGraph(stream, {}, &status);
|
||||
|
||||
if (last_cmd != nullptr) {
|
||||
// This is a fallback path - we don't need to track the command
|
||||
last_cmd->release();
|
||||
}
|
||||
|
||||
if (status != hipSuccess) {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_CODE,
|
||||
"[hipGraph] ChildGraphNode::EnqueueCommands failed with status=%d", status);
|
||||
}
|
||||
} else if (max_streams_ == 1) {
|
||||
// Legacy topological order execution for non-segmented graphs
|
||||
for (int i = 0; i < topoOrder_.size(); i++) {
|
||||
topoOrder_[i]->SetStream(stream_);
|
||||
hipError_t status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
|
||||
@@ -1054,6 +1189,7 @@ class GraphKernelNode : public GraphNode {
|
||||
out << GetLabel(flag);
|
||||
if (DEBUG_HIP_GRAPH_DOT_PRINT) {
|
||||
out << "StreamId:" << stream_id_;
|
||||
out << "\nSegmentId:" << segment_id_;
|
||||
out << "\nSignalIsRequired: " << ((signal_is_required_) ? "true" : "false");
|
||||
out << "\nDeviceId:" << dev_id_;
|
||||
}
|
||||
@@ -1137,7 +1273,7 @@ class GraphKernelNode : public GraphNode {
|
||||
}
|
||||
hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
|
||||
amd::Kernel* kernel = function->kernel();
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
|
||||
auto device = g_devices[dev_id_]->devices()[0];
|
||||
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel->getDeviceKernel(*device));
|
||||
kernargSegmentByteSize_ = devKernel->KernargSegmentByteSize();
|
||||
@@ -1270,6 +1406,11 @@ class GraphKernelNode : public GraphNode {
|
||||
GraphNode* clone() const override { return new GraphKernelNode(*this); }
|
||||
|
||||
hipError_t CreateCommand(hip::Stream* stream) override {
|
||||
// Clear commands_ first, even if node is disabled
|
||||
hipError_t status = GraphNode::CreateCommand(stream);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
if (!isEnabled_) {
|
||||
return hipSuccess;
|
||||
}
|
||||
@@ -1280,14 +1421,10 @@ class GraphKernelNode : public GraphNode {
|
||||
hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
|
||||
amd::Kernel* kernel = function->kernel();
|
||||
amd::ScopedLock lock(function->dflock_);
|
||||
hipError_t status = validateKernelParams(&kernelParams_, func, dev_id_);
|
||||
status = validateKernelParams(&kernelParams_, func, dev_id_);
|
||||
if (hipSuccess != status) {
|
||||
return status;
|
||||
}
|
||||
status = GraphNode::CreateCommand(stream);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
commands_.reserve(1);
|
||||
amd::Command* command;
|
||||
uint32_t flags = 0;
|
||||
@@ -1471,14 +1608,13 @@ class GraphKernelNode : public GraphNode {
|
||||
}
|
||||
|
||||
virtual bool GraphCaptureEnabled() override {
|
||||
bool isGraphCapture = false;
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
|
||||
// Disable capture for cooperative kernels
|
||||
if (!coopKernel_) {
|
||||
isGraphCapture = true;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return isGraphCapture;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1500,15 +1636,16 @@ class GraphMemcpyNode : public GraphNode {
|
||||
GraphNode* clone() const override { return new GraphMemcpyNode(*this); }
|
||||
|
||||
virtual hipError_t CreateCommand(hip::Stream* stream) override {
|
||||
// Clear commands_ first, even if node is disabled
|
||||
hipError_t status = GraphNode::CreateCommand(stream);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
if (!isEnabled_ ||
|
||||
((copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault) &&
|
||||
IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr))) {
|
||||
return hipSuccess;
|
||||
}
|
||||
hipError_t status = GraphNode::CreateCommand(stream);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
commands_.reserve(1);
|
||||
amd::Command* command;
|
||||
status = ihipMemcpy3DCommand(command, &copyParams_, stream);
|
||||
@@ -1632,17 +1769,16 @@ class GraphMemcpyNode : public GraphNode {
|
||||
}
|
||||
}
|
||||
virtual bool GraphCaptureEnabled() override {
|
||||
bool isGraphCapture = false;
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
|
||||
switch (copyParams_.kind) {
|
||||
case hipMemcpyDeviceToDevice:
|
||||
isGraphCapture = true;
|
||||
return true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return isGraphCapture;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1705,14 +1841,15 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
|
||||
GraphNode* clone() const override { return new GraphMemcpyNode1D(*this); }
|
||||
|
||||
virtual hipError_t CreateCommand(hip::Stream* stream) override {
|
||||
if (!isEnabled_ ||
|
||||
((kind_ == hipMemcpyHostToHost || kind_ == hipMemcpyDefault) && IsHtoHMemcpy(dst_, src_))) {
|
||||
return hipSuccess;
|
||||
}
|
||||
// Clear commands_ first, even if node is disabled
|
||||
hipError_t status = GraphNode::CreateCommand(stream);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
if (!isEnabled_ ||
|
||||
((kind_ == hipMemcpyHostToHost || kind_ == hipMemcpyDefault) && IsHtoHMemcpy(dst_, src_))) {
|
||||
return hipSuccess;
|
||||
}
|
||||
commands_.reserve(1);
|
||||
amd::Command* command = nullptr;
|
||||
if (!AMD_DIRECT_DISPATCH) {
|
||||
@@ -1867,18 +2004,17 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
|
||||
}
|
||||
}
|
||||
virtual bool GraphCaptureEnabled() override {
|
||||
bool isGraphCapture = false;
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
|
||||
hip::MemcpyType type = ihipGetMemcpyType(src_, dst_, kind_);
|
||||
switch (type) {
|
||||
case hipCopyBuffer:
|
||||
isGraphCapture = true;
|
||||
return true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return isGraphCapture;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -2139,6 +2275,13 @@ class GraphMemsetNode : public GraphNode {
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true only when this node has a parent graph (parentGraph_ is non-null)
// and that graph reports segment scheduling as enabled; returns false otherwise.
// NOTE(review): presumably this is what opts the node into the graph packet-capture
// path - confirm against the callers of GraphCaptureEnabled().
virtual bool GraphCaptureEnabled() override {
|
||||
// The null check guards the dereference of parentGraph_ on the right-hand side.
if (parentGraph_ != nullptr && parentGraph_->IsSegmentSchedulingEnabled()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
hipError_t CreateCommand(hip::Stream* stream) override {
|
||||
hipError_t status = GraphNode::CreateCommand(stream);
|
||||
if (status != hipSuccess) {
|
||||
@@ -2319,6 +2462,8 @@ class GraphHostNode : public GraphNode {
|
||||
amd::Command::EventWaitList waitList;
|
||||
commands_.reserve(1);
|
||||
amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList);
|
||||
// This is just to invoke a callback, so no need to flush caches.
|
||||
command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
|
||||
commands_.emplace_back(command);
|
||||
return hipSuccess;
|
||||
}
|
||||
@@ -2333,6 +2478,9 @@ class GraphHostNode : public GraphNode {
|
||||
if (!commands_[0]->setCallback(CL_COMPLETE, GraphHostNode::Callback, &NodeParams_)) {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed during setCallback");
|
||||
}
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE,
|
||||
"EnqueueCommands: NodeParams_.fn=%p, NodeParams_.userData=%p", NodeParams_.fn,
|
||||
NodeParams_.userData);
|
||||
commands_[0]->enqueue();
|
||||
// Add the new barrier to stall the stream, until the callback is done
|
||||
amd::Command::EventWaitList eventWaitList;
|
||||
@@ -2342,6 +2490,8 @@ class GraphHostNode : public GraphNode {
|
||||
if (block_command == nullptr) {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed during block command creation");
|
||||
}
|
||||
// This is just to invoke a callback, so no need to flush caches.
|
||||
block_command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
|
||||
block_command->enqueue();
|
||||
block_command->notifyCmdQueue();
|
||||
block_command->release();
|
||||
|
||||
@@ -47,6 +47,8 @@
|
||||
#define KCYN "\x1B[36m"
|
||||
#define KWHT "\x1B[37m"
|
||||
|
||||
// Dereferences `ptr` when it is non-null and returns the pointed-to value;
// returns nullptr when `ptr` itself is null. Because nullptr is returned as a T,
// this only compiles for T that nullptr converts to (e.g. pointer types).
template <typename T> T ReturnPtrValue(T* ptr) {
  if (ptr == nullptr) {
    return nullptr;
  }
  return *ptr;
}
|
||||
|
||||
namespace hip{
|
||||
extern std::once_flag g_ihipInitialized;
|
||||
}
|
||||
|
||||
@@ -34,8 +34,6 @@ namespace hip {
|
||||
amd::Monitor hipArraySetLock{};
|
||||
std::unordered_set<hipArray*> hipArraySet;
|
||||
|
||||
template <typename T> T ReturnPtrValue(T* ptr) { return (ptr != nullptr) ? *ptr : nullptr; }
|
||||
|
||||
// ================================================================================================
|
||||
amd::Memory* getMemoryObject(const void* ptr, size_t& offset, size_t size) {
|
||||
auto memObj = amd::MemObjMap::FindMemObj(ptr, &offset);
|
||||
|
||||
@@ -131,7 +131,7 @@ Settings::Settings() {
|
||||
: HIP_FORCE_DEV_KERNARG;
|
||||
|
||||
limit_blit_wg_ = 16;
|
||||
DEBUG_CLR_GRAPH_PACKET_CAPTURE = false; // disable graph performance optimizations for PAL
|
||||
DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING = 0; // disable graph performance optimizations for PAL
|
||||
}
|
||||
|
||||
bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
|
||||
@@ -352,7 +352,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
}
|
||||
}
|
||||
|
||||
// The hsa copy api would result in a dirty cache state
|
||||
// The ROCR copy api guarantees coherency after the copy
|
||||
gpu().setFenceDirty(false);
|
||||
return true;
|
||||
}
|
||||
@@ -590,7 +590,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
|
||||
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
gpu().addSystemScope();
|
||||
// The hsa copy api would result in a dirty cache state
|
||||
// The ROCR copy api guarantees coherency after the copy
|
||||
gpu().setFenceDirty(false);
|
||||
} else {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
|
||||
@@ -553,8 +553,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
|
||||
if (HSA_STATUS_SUCCESS != result) {
|
||||
LogError("hsa_amd_signal_async_handler() failed to set the handler!");
|
||||
} else {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
|
||||
prof_signal->signal_.handle, prof_signal);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG,
|
||||
"Set Handler: handle(0x%lx), timestamp(%p), blocking CB=%d",
|
||||
prof_signal->signal_.handle, prof_signal,
|
||||
ts->command().Callback() != nullptr && ts->GetBlocking());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1009,7 +1011,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
|
||||
|
||||
// Check for queue full and wait if needed.
|
||||
uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
|
||||
fence_dirty_ = true;
|
||||
setFenceDirty(true);
|
||||
|
||||
if (addSystemScope_) {
|
||||
header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
|
||||
@@ -1024,14 +1026,14 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
|
||||
|
||||
// Reset fence_dirty_ flag if we submit a packet with system scopes
|
||||
if (expected_fence_state == amd::Device::kCacheStateSystem) {
|
||||
fence_dirty_ = false;
|
||||
setFenceDirty(false);
|
||||
}
|
||||
|
||||
// Dirty optimization to save on consequent dispatch packets which have requested flushes
|
||||
if (fence_state_ == amd::Device::kCacheStateSystem &&
|
||||
expected_fence_state == amd::Device::kCacheStateSystem) {
|
||||
header = dispatchPacketHeader_;
|
||||
fence_dirty_ = true;
|
||||
setFenceDirty(true);
|
||||
}
|
||||
|
||||
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
|
||||
@@ -1076,7 +1078,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
|
||||
if (header != 0) {
|
||||
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), header, rest);
|
||||
}
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
|
||||
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
|
||||
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
|
||||
"setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], private_seg_size=%u, "
|
||||
@@ -1204,12 +1206,18 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
|
||||
amd::Os::yield();
|
||||
}
|
||||
|
||||
fence_dirty_ = true;
|
||||
setFenceDirty(true);
|
||||
|
||||
// Save header of first packet in this batch
|
||||
AqlPacket* firstPacket = packets[processedPackets];
|
||||
uint16_t firstPacketHeader = firstPacket->header;
|
||||
uint16_t firstPacketRest = firstPacket->setup;
|
||||
// Separate header for doorbell ring that can be modified
|
||||
uint16_t doorbellHeader = firstPacketHeader;
|
||||
|
||||
// Save header of last packet in this batch (if different from first)
|
||||
AqlPacket* lastPacket = packets[processedPackets + batchSize - 1];
|
||||
uint16_t lastPacketHeader = lastPacket->header;
|
||||
|
||||
// Process batchSize packets
|
||||
for (size_t i = 0; i < batchSize; ++i) {
|
||||
@@ -1217,8 +1225,6 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
|
||||
uint64_t index = startIndex + i;
|
||||
|
||||
AqlPacket* packet = packets[packetIndex];
|
||||
uint16_t header = packet->header;
|
||||
|
||||
|
||||
bool attachSignal = timestamp_ != nullptr || attach_signal;
|
||||
|
||||
@@ -1247,84 +1253,105 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& pa
|
||||
AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];
|
||||
|
||||
// For first packet in batch, invalidate header before writing
|
||||
if (i == 0) {
|
||||
bool isFirstPacket = (i == 0);
|
||||
bool isLastPacket = (i == batchSize - 1);
|
||||
|
||||
if (isFirstPacket) {
|
||||
if (addSystemScope_) {
|
||||
// Add system scope on the acq on first packet
|
||||
firstPacketHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
|
||||
firstPacketHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
|
||||
// Add system scope on the acq on first packet (modify doorbell header)
|
||||
doorbellHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
|
||||
doorbellHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
|
||||
}
|
||||
// Invalidate the header of the first packet in the batch
|
||||
packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
|
||||
}
|
||||
|
||||
// Copy the packet and then write the valid of the first packet
|
||||
*aql_loc = *packet;
|
||||
|
||||
// Restore the header of the first packet
|
||||
packet->header = firstPacketHeader;
|
||||
} else {
|
||||
// For the end packet in batch set flags
|
||||
if (i == batchSize - 1) {
|
||||
if (addSystemScope_) {
|
||||
// Add system scope on the release on last packet
|
||||
// For the end packet in batch set flags
|
||||
if (isLastPacket) {
|
||||
if (addSystemScope_) {
|
||||
// If batch has only 1 packet, update doorbell header for release scope
|
||||
// (packet->header is already invalid, so don't modify it)
|
||||
if (batchSize == 1) {
|
||||
doorbellHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
doorbellHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
} else {
|
||||
// Add system scope on the release on last packet (different from first)
|
||||
packet->header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
packet->header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
addSystemScope_ = false;
|
||||
}
|
||||
auto expected_fence_state =
|
||||
extractAqlBits(packet->header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
// Reset fence_dirty_ flag if we submit a packet with system scopes
|
||||
if (expected_fence_state == amd::Device::kCacheStateSystem) {
|
||||
fence_dirty_ = false;
|
||||
}
|
||||
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
|
||||
addSystemScope_ = false;
|
||||
}
|
||||
|
||||
// Copy the packet to the queue
|
||||
*aql_loc = *packet;
|
||||
// Use doorbellHeader for single packet batch (packet->header is invalid),
|
||||
// else use packet->header
|
||||
uint16_t headerForFenceState = (batchSize == 1) ? doorbellHeader : packet->header;
|
||||
auto expected_fence_state =
|
||||
extractAqlBits(headerForFenceState, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
// Reset fence_dirty_ flag if we submit a packet with system scopes
|
||||
if (expected_fence_state == amd::Device::kCacheStateSystem) {
|
||||
setFenceDirty(false);
|
||||
}
|
||||
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
|
||||
}
|
||||
|
||||
// Copy the packet to the queue
|
||||
*aql_loc = *packet;
|
||||
|
||||
// Print kernel name for kernel dispatch packets
|
||||
if (kernelNames && packetIndex < kernelNames->size()) {
|
||||
// Use doorbellHeader for first packet (packet->header is invalid), else use packet->header
|
||||
uint16_t headerForPrint = isFirstPacket ? doorbellHeader : packet->header;
|
||||
uint8_t packetType =
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
|
||||
extractAqlBits(headerForPrint, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
|
||||
if (packetType == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN2, "Graph ShaderName : %s, device id : %u",
|
||||
(*kernelNames)[packetIndex].c_str(), dev().index());
|
||||
|
||||
ClPrint(
|
||||
amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
|
||||
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
|
||||
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
|
||||
"setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], "
|
||||
"private_seg_size=%u, group_seg_size=%u, kernel_obj=0x%zx, "
|
||||
"kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
|
||||
"rptr=%u, wptr=%u",
|
||||
gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, packetType,
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
|
||||
packet->setup, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
|
||||
Hsa::queue_load_read_index_scacquire(gpu_queue_), index);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
|
||||
"SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
|
||||
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
|
||||
"setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], "
|
||||
"private_seg_size=%u, group_seg_size=%u, kernel_obj=0x%zx, "
|
||||
"kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
|
||||
"rptr=%u, wptr=%u",
|
||||
gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, headerForPrint, packetType,
|
||||
extractAqlBits(headerForPrint, HSA_PACKET_HEADER_BARRIER,
|
||||
HSA_PACKET_HEADER_WIDTH_BARRIER),
|
||||
extractAqlBits(headerForPrint, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
|
||||
extractAqlBits(headerForPrint, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
|
||||
packet->setup,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
|
||||
Hsa::queue_load_read_index_scacquire(gpu_queue_), index);
|
||||
}
|
||||
}
|
||||
|
||||
// Restore the header of the first packet
|
||||
if (isFirstPacket) {
|
||||
packet->header = firstPacketHeader;
|
||||
}
|
||||
|
||||
// Restore the header of the last packet (if different from first)
|
||||
if (isLastPacket && batchSize > 1) {
|
||||
packet->header = lastPacketHeader;
|
||||
}
|
||||
}
|
||||
|
||||
// Write valid header for the first packet in the batch
|
||||
AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[startIndex & queueMask];
|
||||
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), firstPacketHeader, firstPacketRest);
|
||||
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), doorbellHeader, firstPacketRest);
|
||||
|
||||
// Ring doorbell for this batch
|
||||
Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, startIndex);
|
||||
@@ -1367,8 +1394,7 @@ bool VirtualGPU::dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
|
||||
|
||||
dispatchBlockingWait();
|
||||
|
||||
// Add all kernel names in bulk
|
||||
vcmd->addKernelNames(kernelNames);
|
||||
vcmd->setKernelNamesRef(&kernelNames);
|
||||
|
||||
// Dispatch all packets with a single doorbell ring
|
||||
// Cast packets vector to AQL packets vector on the fly
|
||||
@@ -1428,7 +1454,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
|
||||
uint64_t read = Hsa::queue_load_read_index_relaxed(gpu_queue_);
|
||||
|
||||
fence_dirty_ = true;
|
||||
setFenceDirty(true);
|
||||
auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
if (!skipSignal && (signal.handle == 0)) {
|
||||
@@ -1443,7 +1469,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
|
||||
// Reset fence_dirty_ flag if we submit a barrier with system scopes
|
||||
if (cache_state == amd::Device::kCacheStateSystem) {
|
||||
fence_dirty_ = false;
|
||||
setFenceDirty(false);
|
||||
}
|
||||
|
||||
while ((index - Hsa::queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
|
||||
@@ -1453,7 +1479,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), packetHeader, 0);
|
||||
|
||||
Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, index);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
|
||||
"SWq=0x%zx, HWq=0x%zx, id=%d, BarrierAND Header = 0x%x (type=%d, barrier=%d, acquire=%d,"
|
||||
" release=%d), "
|
||||
"dep_signal=[0x%zx, 0x%zx, 0x%zx, 0x%zx, 0x%zx], completion_signal=0x%zx, "
|
||||
@@ -1512,7 +1538,7 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
|
||||
}
|
||||
}
|
||||
|
||||
fence_dirty_ = true;
|
||||
setFenceDirty(true);
|
||||
auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
|
||||
@@ -1527,7 +1553,7 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
|
||||
|
||||
// Reset fence_dirty_ flag if we submit a barrier
|
||||
if (cache_state == amd::Device::kCacheStateSystem) {
|
||||
fence_dirty_ = false;
|
||||
setFenceDirty(false);
|
||||
}
|
||||
|
||||
uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
|
||||
@@ -1543,7 +1569,7 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
|
||||
|
||||
Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, index);
|
||||
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
|
||||
"SWq=0x%zx, HWq=0x%zx, id=%d, BarrierValue Header = 0x%x AmdFormat = 0x%x "
|
||||
"(type=%d, barrier=%d, acquire=%d, release=%d), "
|
||||
"signal=0x%zx, value = 0x%llx mask = 0x%llx cond: %s, completion_signal=0x%zx, "
|
||||
@@ -1576,7 +1602,7 @@ void VirtualGPU::ResetQueueStates() {
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::releaseGpuMemoryFence(bool skip_cpu_wait) {
|
||||
if (hasPendingDispatch_ || !Barriers().IsExternalSignalListEmpty()) {
|
||||
if (hasPendingDispatch_ || isFenceDirty() || !Barriers().IsExternalSignalListEmpty()) {
|
||||
// Dispatch barrier packet into the queue
|
||||
dispatchBarrierPacket(kBarrierPacketHeader);
|
||||
hasPendingDispatch_ = false;
|
||||
@@ -1944,6 +1970,17 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
|
||||
} else {
|
||||
// Assume serialization on the same queue...
|
||||
}
|
||||
|
||||
// Check if the waiting event's queue has a dirty fence and propagate it
|
||||
if (!isFenceDirty()) {
|
||||
amd::Command* wait_cmd = static_cast<amd::Command*>(*it);
|
||||
if (wait_cmd->queue() != nullptr && wait_cmd->queue() != command.queue()) {
|
||||
device::VirtualDevice* wait_vdev = wait_cmd->queue()->vdev();
|
||||
if (wait_vdev != nullptr && wait_vdev->isFenceDirty()) {
|
||||
setFenceDirty(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3688,7 +3725,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
if (isGraphCapture) {
|
||||
argBuffer = command_->getGraphKernArg(gpuKernel.KernargSegmentByteSize(),
|
||||
gpuKernel.KernargSegmentAlignment(), dev().index());
|
||||
command_->SetKernelName(gpuKernel.getDemangledName().c_str());
|
||||
command_->SetKernelName(gpuKernel.getDemangledName());
|
||||
} else {
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN,
|
||||
"KernargSegmentByteSize = %lu "
|
||||
@@ -3916,6 +3953,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
|
||||
if (timestamp_ != nullptr) {
|
||||
const Settings& settings = dev().settings();
|
||||
int32_t releaseFlags = vcmd.getCommandEntryScope();
|
||||
|
||||
if (releaseFlags == Device::CacheState::kCacheStateIgnore) {
|
||||
if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) {
|
||||
dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
|
||||
|
||||
@@ -448,8 +448,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
amd::Command* command() const { return command_; }
|
||||
|
||||
void* allocKernArg(size_t size, size_t alignment);
|
||||
bool isFenceDirty() const { return fence_dirty_; }
|
||||
void setFenceDirty(bool state) { fence_dirty_ = state; }
|
||||
// Returns the current value of fence_dirty_, read with acquire ordering.
// NOTE(review): the flag is also queried through another queue's virtual device
// (see the wait-list check that calls wait_vdev->isFenceDirty()), which is
// presumably why the accessors use atomic load/store - confirm.
bool isFenceDirty() const { return fence_dirty_.load(std::memory_order_acquire); }
|
||||
// Sets fence_dirty_ to `state`, written with release ordering.
void setFenceDirty(bool state) { fence_dirty_.store(state, std::memory_order_release); }
|
||||
void WaitCompleteSignal(hsa_signal_t signal);
|
||||
|
||||
void HiddenHeapInit();
|
||||
|
||||
@@ -1383,6 +1383,7 @@ class AccumulateCommand : public Command {
|
||||
private:
|
||||
//! Kernel names and timestamps list for activity profiling
|
||||
std::vector<std::string> kernelNames_;
|
||||
const std::vector<std::string>* kernelNamesRef_ = nullptr;
|
||||
std::vector<std::pair<uint64_t, uint64_t>> tsList_;
|
||||
|
||||
public:
|
||||
@@ -1399,13 +1400,20 @@ class AccumulateCommand : public Command {
|
||||
kernelNames_.insert(kernelNames_.end(), kernelNames.begin(), kernelNames.end());
|
||||
}
|
||||
|
||||
//! Set kernel names by reference: stores the given pointer in kernelNamesRef_
//! without copying the vector it points to.
//! NOTE(review): only the raw pointer is kept, so the pointed-to vector presumably
//! has to outlive every later getKernelNames() call - confirm against the caller.
|
||||
void setKernelNamesRef(const std::vector<std::string>* kernelNames) {
|
||||
kernelNamesRef_ = kernelNames;
|
||||
}
|
||||
|
||||
//! Append a (startTs, endTs) timestamp pair to tsList_.
//! The pair is appended unconditionally; no availability check is made here.
|
||||
void addTimestamps(uint64_t startTs, uint64_t endTs) {
|
||||
tsList_.push_back(std::make_pair(startTs, endTs));
|
||||
}
|
||||
|
||||
//! Return the kernel names
|
||||
const std::vector<std::string>& getKernelNames() const { return kernelNames_; }
|
||||
//! Return the kernel names: the vector pointed to by kernelNamesRef_ when that
//! pointer is non-null, otherwise the locally stored kernelNames_.
const std::vector<std::string>& getKernelNames() const {
|
||||
// A non-null kernelNamesRef_ takes precedence over kernelNames_.
return kernelNamesRef_ != nullptr ? *kernelNamesRef_ : kernelNames_;
|
||||
}
|
||||
|
||||
//! Return the kernel timestamps
|
||||
const std::vector<std::pair<uint64_t, uint64_t>>& getTimestamps() const { return tsList_; }
|
||||
|
||||
@@ -237,8 +237,6 @@ release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \
|
||||
"Set initial heap size for device malloc.") \
|
||||
release(bool, HIP_FORCE_DEV_KERNARG, true, \
|
||||
"Force device mem for kernel args.") \
|
||||
release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true, \
|
||||
"Enable/Disable graph packet capturing") \
|
||||
release(bool, GPU_DEBUG_ENABLE, false, \
|
||||
"Enables collection of extra info for debugger at some perf cost") \
|
||||
release(cstring, HIPRTC_COMPILE_OPTIONS_APPEND, "", \
|
||||
@@ -255,6 +253,8 @@ release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \
|
||||
"Forces the number of streams for the graph parallel execution") \
|
||||
release(uint, DEBUG_HIP_GRAPH_BATCH_SIZE, 256, \
|
||||
"Number of graph nodes to batch at a time") \
|
||||
release(uint, DEBUG_HIP_GRAPH_SEGMENT_SCHEDULING, 1, \
|
||||
"0 = Disable, 1 = Enable, 2 = Force") \
|
||||
release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \
|
||||
"Blocks synchronization on CPU until the callback processing is done")\
|
||||
release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000, \
|
||||
|
||||
새 이슈에서 참조
사용자 차단