SWDEV-490861 - Remove recursion and extra loop in hipGraphLaunch (#1792)
这个提交包含在:
@@ -196,9 +196,7 @@ void Graph::ScheduleOneNode(Node node, int stream_id) {
|
||||
auto child = reinterpret_cast<hip::ChildGraphNode*>(node)->GetChildGraph();
|
||||
child->ScheduleNodes();
|
||||
max_streams_ = std::max(max_streams_, child->max_streams_);
|
||||
if (child->max_streams_ == 1) {
|
||||
reinterpret_cast<hip::ChildGraphNode*>(node)->GraphExec::TopologicalOrder();
|
||||
}
|
||||
reinterpret_cast<hip::ChildGraphNode*>(node)->GraphExec::TopologicalOrder();
|
||||
}
|
||||
for (auto edge : node->GetEdges()) {
|
||||
ScheduleOneNode(edge, stream_id);
|
||||
@@ -910,92 +908,114 @@ void GraphExec::UpdateStreams(hip::Stream* launch_stream) {
|
||||
|
||||
|
||||
// ================================================================================================
|
||||
bool Graph::RunOneNode(Node node, bool wait) {
|
||||
if (node->launch_id_ == -1) {
|
||||
// Clear the storage of the wait nodes
|
||||
memset(&wait_order_[0], 0, sizeof(Node) * wait_order_.size());
|
||||
amd::Command::EventWaitList waitList;
|
||||
// Walk through dependencies and find the last launches on each parallel stream
|
||||
for (auto depNode : node->GetDependencies()) {
|
||||
// Process only the nodes that have been submitted
|
||||
if (depNode->launch_id_ != -1) {
|
||||
// If it's the same stream then skip the signal, since it's in order
|
||||
if (depNode->stream_id_ != node->stream_id_) {
|
||||
// If there is no wait node on the stream, then assign one
|
||||
if ((wait_order_[depNode->stream_id_] == nullptr) ||
|
||||
// If another node executed on the same stream, then use the latest launch only,
|
||||
// since the same stream has in-order run
|
||||
(wait_order_[depNode->stream_id_]->launch_id_ < depNode->launch_id_)) {
|
||||
wait_order_[depNode->stream_id_] = depNode;
|
||||
}
|
||||
bool Graph::RunOneNode(Node node) {
|
||||
// Clear the storage of the wait nodes
|
||||
memset(&wait_order_[0], 0, sizeof(Node) * wait_order_.size());
|
||||
amd::Command::EventWaitList waitList;
|
||||
// Walk through dependencies and find the last launches on each parallel stream
|
||||
for (auto depNode : node->GetDependencies()) {
|
||||
// Process only the nodes that have been submitted
|
||||
if (depNode->launch_id_ != -1) {
|
||||
// If it's the same stream then skip the signal, since it's in order
|
||||
if (depNode->stream_id_ != node->stream_id_) {
|
||||
// If there is no wait node on the stream, then assign one
|
||||
if ((wait_order_[depNode->stream_id_] == nullptr) ||
|
||||
// If another node executed on the same stream, then use the latest launch only,
|
||||
// since the same stream has in-order run
|
||||
(wait_order_[depNode->stream_id_]->launch_id_ < depNode->launch_id_)) {
|
||||
wait_order_[depNode->stream_id_] = depNode;
|
||||
}
|
||||
} else {
|
||||
// It should be a safe return,
|
||||
// since the last edge to this dependency has to submit the command
|
||||
return true;
|
||||
// Release nodes that were enqueued on the same stream, since they are not included in the
|
||||
// wait list. Their references were retained for all outgoing edges.
|
||||
for (auto command : depNode->GetCommands()) {
|
||||
command->release();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
node->SetWait(false);
|
||||
// It should be a safe return,
|
||||
// since the last edge to this dependency has to submit the command
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Create a wait list from the last launches of all dependencies
|
||||
for (auto dep : wait_order_) {
|
||||
if (dep != nullptr) {
|
||||
// Add all commands in the wait list
|
||||
if (dep->GetType() != hipGraphNodeTypeGraph) {
|
||||
for (auto command : dep->GetCommands()) {
|
||||
waitList.push_back(command);
|
||||
}
|
||||
// Create a wait list from the last launches of all dependencies
|
||||
for (auto dep : wait_order_) {
|
||||
if (dep != nullptr) {
|
||||
// Add all commands in the wait list
|
||||
if (dep->GetType() != hipGraphNodeTypeGraph) {
|
||||
for (auto command : dep->GetCommands()) {
|
||||
waitList.push_back(command);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (node->GetType() == hipGraphNodeTypeGraph) {
|
||||
// Process child graph separately, since, there is no connection
|
||||
auto child = reinterpret_cast<hip::ChildGraphNode*>(node)->GetChildGraph();
|
||||
if (!reinterpret_cast<hip::ChildGraphNode*>(node)->GetGraphCaptureStatus()) {
|
||||
child->RunNodes(node->stream_id_, &streams_, &waitList);
|
||||
}
|
||||
if (node->GetType() == hipGraphNodeTypeGraph) {
|
||||
// Process child graph separately, since, there is no connection
|
||||
auto child = reinterpret_cast<hip::ChildGraphNode*>(node)->GetChildGraph();
|
||||
if (!reinterpret_cast<hip::ChildGraphNode*>(node)->GetGraphCaptureStatus()) {
|
||||
child->RunNodes(node->stream_id_, &streams_, &waitList);
|
||||
}
|
||||
} else {
|
||||
// Assing a stream to the current node
|
||||
node->SetStream(streams_);
|
||||
// Create the execution commands on the assigned stream
|
||||
auto status = node->CreateCommand(node->GetQueue());
|
||||
if (status != hipSuccess) {
|
||||
LogPrintfError("Command creation for node id(%d) failed!", current_id_ + 1);
|
||||
return false;
|
||||
}
|
||||
// If a wait was requested, then process the list
|
||||
if (node->GetWait() && !waitList.empty()) {
|
||||
node->UpdateEventWaitLists(waitList);
|
||||
}
|
||||
// Start the execution
|
||||
node->EnqueueCommands(node->GetQueue());
|
||||
}
|
||||
// Release commands of dependency nodes that were included in the wait list after enqueue
|
||||
for (auto dep : wait_order_) {
|
||||
if (dep != nullptr) {
|
||||
// Add all commands in the wait list
|
||||
if (dep->GetType() != hipGraphNodeTypeGraph) {
|
||||
for (auto command : dep->GetCommands()) {
|
||||
command->release();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Assing a stream to the current node
|
||||
node->SetStream(streams_);
|
||||
// Create the execution commands on the assigned stream
|
||||
auto status = node->CreateCommand(node->GetQueue());
|
||||
if (status != hipSuccess) {
|
||||
LogPrintfError("Command creation for node id(%d) failed!", current_id_ + 1);
|
||||
return false;
|
||||
}
|
||||
// Retain all commands, since potentially the command can finish before a wait signal
|
||||
}
|
||||
}
|
||||
// Assign the launch ID of the submmitted node
|
||||
// This is also applied to childGraphs to prevent them from being reprocessed
|
||||
node->launch_id_ = current_id_++;
|
||||
uint32_t i = 0;
|
||||
// Execute the nodes in the edges list
|
||||
for (auto edge : node->GetEdges()) {
|
||||
// Don't wait in the nodes, executed on the same streams and if it has just one dependency
|
||||
bool wait =
|
||||
((i < DEBUG_HIP_FORCE_GRAPH_QUEUES) || (edge->GetDependencies().size() > 1)) ? true : false;
|
||||
edge->SetWait(wait);
|
||||
i++;
|
||||
// Retain the current node for all its outgoing edges.
|
||||
// Each edge will include this node in its waitlist and release it after their commands are
|
||||
// enqueued.
|
||||
for (auto command : node->GetCommands()) {
|
||||
command->retain();
|
||||
}
|
||||
}
|
||||
if (node->GetEdges().size() == 0) {
|
||||
// Add a leaf node into the list for a wait.
|
||||
// Always use the last node, since it's the latest for the particular queue
|
||||
leafs_[node->stream_id_] = node;
|
||||
// An extra retain is needed for the leaves in order to be able to later enqueue a marker
|
||||
// on the app stream that has these commands in the waitlist.
|
||||
if (node->GetType() != hipGraphNodeTypeGraph) {
|
||||
for (auto command : node->GetCommands()) {
|
||||
command->retain();
|
||||
}
|
||||
|
||||
// If a wait was requested, then process the list
|
||||
if (wait && !waitList.empty()) {
|
||||
node->UpdateEventWaitLists(waitList);
|
||||
}
|
||||
// Start the execution
|
||||
node->EnqueueCommands(node->GetQueue());
|
||||
}
|
||||
// Assign the launch ID of the submmitted node
|
||||
// This is also applied to childGraphs to prevent them from being reprocessed
|
||||
node->launch_id_ = current_id_++;
|
||||
uint32_t i = 0;
|
||||
// Execute the nodes in the edges list
|
||||
for (auto edge : node->GetEdges()) {
|
||||
// Don't wait in the nodes, executed on the same streams and if it has just one dependency
|
||||
bool wait = ((i < DEBUG_HIP_FORCE_GRAPH_QUEUES) || (edge->GetDependencies().size() > 1))
|
||||
? true
|
||||
: false;
|
||||
// Execute the edge node
|
||||
if (!RunOneNode(edge, wait)) {
|
||||
return false;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
if (i == 0) {
|
||||
// Add a leaf node into the list for a wait.
|
||||
// Always use the last node, since it's the latest for the particular queue
|
||||
leafs_[node->stream_id_] = node;
|
||||
}
|
||||
}
|
||||
|
||||
node->SetWait(false);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1039,21 +1059,22 @@ bool Graph::RunNodes(int32_t base_stream, const std::vector<hip::Stream*>* paral
|
||||
}
|
||||
|
||||
// Run all commands in the graph
|
||||
for (auto node : vertices_) {
|
||||
if (node->launch_id_ == -1) {
|
||||
if (!RunOneNode(node, true)) {
|
||||
return false;
|
||||
}
|
||||
for (auto node : GetTopoOrder()) {
|
||||
node->launch_id_ = -1;
|
||||
if (!RunOneNode(node)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
wait_list.clear();
|
||||
// Check if the graph has multiple leaf nodes
|
||||
for (uint32_t i = 0; i < DEBUG_HIP_FORCE_GRAPH_QUEUES; ++i) {
|
||||
if ((base_stream != i) && (leafs_[i] != nullptr)) {
|
||||
if ((leafs_[i] != nullptr) && (leafs_[i]->GetType() != hipGraphNodeTypeGraph)) {
|
||||
// Add all commands in the wait list
|
||||
if (leafs_[i]->GetType() != hipGraphNodeTypeGraph) {
|
||||
for (auto command : leafs_[i]->GetCommands()) {
|
||||
for (auto command : leafs_[i]->GetCommands()) {
|
||||
if (base_stream != i) {
|
||||
wait_list.push_back(command);
|
||||
} else {
|
||||
command->release();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1065,16 +1086,11 @@ bool Graph::RunNodes(int32_t base_stream, const std::vector<hip::Stream*>* paral
|
||||
end_marker->enqueue();
|
||||
end_marker->release();
|
||||
}
|
||||
}
|
||||
// Release commands after execution
|
||||
for (auto& node : vertices_) {
|
||||
node->launch_id_ = -1;
|
||||
if (node->GetType() != hipGraphNodeTypeGraph) {
|
||||
for (auto command : node->GetCommands()) {
|
||||
command->release();
|
||||
}
|
||||
for (auto command : wait_list) {
|
||||
command->release();
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -462,6 +462,8 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
}
|
||||
void SetDeviceId(int id) { dev_id_ = id; }
|
||||
int GetDeviceId() const { return dev_id_; }
|
||||
bool GetWait() const { return wait_; }
|
||||
void SetWait(bool wait) { wait_ = wait; }
|
||||
|
||||
protected:
|
||||
// Declare Graph and GraphExec as friends of node for simpler access to GraphNode fields
|
||||
@@ -492,6 +494,7 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
size_t kernargSegmentAlignment_ = 256; //!< Kernel arg segment alignment
|
||||
int dev_id_; //!< Device Id when node is created(dev id from capture stream/current device
|
||||
//!< when explicitly added)
|
||||
bool wait_ = false;
|
||||
};
|
||||
|
||||
class GraphEventWaitNode : public GraphNode {
|
||||
@@ -633,6 +636,7 @@ class Graph {
|
||||
size_t GetNodeCount() const { return vertices_.size(); }
|
||||
/// returns all the nodes in the graph
|
||||
const std::vector<Node>& GetNodes() const { return vertices_; }
|
||||
const std::vector<Node>& GetTopoOrder() const { return topoOrder_; }
|
||||
/// returns all the edges in the graph
|
||||
std::vector<std::pair<Node, Node>> GetEdges() const;
|
||||
// returns the original graph ptr if cloned
|
||||
@@ -678,9 +682,7 @@ class Graph {
|
||||
void ScheduleNodes();
|
||||
|
||||
//! Runs one node on the assigned stream
|
||||
bool RunOneNode(Node node, //!< Node for the execution on GPU
|
||||
bool wait //!< Wait dependencies
|
||||
);
|
||||
bool RunOneNode(Node node); //!< Node for the execution on GPU
|
||||
|
||||
//! Runs all nodes from the execution graph on the assigned streams
|
||||
bool RunNodes(
|
||||
@@ -783,7 +785,8 @@ class Graph {
|
||||
//!< during multi-device graph execution scheduling.
|
||||
std::unordered_map<int, std::set<int>> streams_dev_ids_;
|
||||
int instantiateDeviceId_ = -1;
|
||||
|
||||
//! Topological order of the graph doesn't include nodes embedded as part of the child graph
|
||||
std::vector<Node> topoOrder_;
|
||||
private:
|
||||
friend class GraphExec;
|
||||
std::vector<Node> vertices_;
|
||||
@@ -894,8 +897,6 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
void FindStreamsReqPerDev();
|
||||
|
||||
protected:
|
||||
//! Topological order of the graph doesn't include nodes embedded as part of the child graph
|
||||
std::vector<Node> topoOrder_;
|
||||
//! parallel streams per device
|
||||
std::unordered_map<int, std::vector<hip::Stream*>> parallel_streams_;
|
||||
uint64_t flags_ = 0;
|
||||
|
||||
在新工单中引用
屏蔽一个用户