From b25939b4de53b2bb5bc4447fd22bc1c8237befdf Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Tue, 8 Aug 2023 13:25:46 +0000 Subject: [PATCH] SWDEV-415772, SWDEV-414682 - Fix childgraph node execution Change-Id: If9ffc08d98a57b8daa5f131f72ef1bf2317f29e1 [ROCm/clr commit: f76a40c26d14b74bd82a14db8bbf05dfb2f1de68] --- projects/clr/hipamd/src/hip_graph.cpp | 6 +- .../clr/hipamd/src/hip_graph_internal.cpp | 80 ++++++++++--------- .../clr/hipamd/src/hip_graph_internal.hpp | 61 +++++++------- 3 files changed, 72 insertions(+), 75 deletions(-) diff --git a/projects/clr/hipamd/src/hip_graph.cpp b/projects/clr/hipamd/src/hip_graph.cpp index 8db887607b..c0567381e4 100644 --- a/projects/clr/hipamd/src/hip_graph.cpp +++ b/projects/clr/hipamd/src/hip_graph.cpp @@ -1227,15 +1227,13 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph, } std::vector> parallelLists; std::unordered_map> nodeWaitLists; - std::unordered_set graphExeUserObj; clonedGraph->GetRunList(parallelLists, nodeWaitLists); std::vector graphNodes; if (false == clonedGraph->TopologicalOrder(graphNodes)) { return hipErrorInvalidValue; } - clonedGraph->GetUserObjs(graphExeUserObj); - *pGraphExec = new hip::GraphExec(graphNodes, parallelLists, nodeWaitLists, clonedNodes, - graphExeUserObj, flags); + *pGraphExec = + new hip::GraphExec(graphNodes, parallelLists, nodeWaitLists, clonedGraph, clonedNodes, flags); if (*pGraphExec != nullptr) { graph->SetGraphInstantiated(true); return (*pGraphExec)->Init(); diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 4d39c566d9..cb71e4583f 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -495,8 +495,8 @@ hipError_t GraphExec::Init() { hipError_t FillCommands(std::vector>& parallelLists, std::unordered_map>& nodeWaitLists, - std::vector& topoOrder, std::vector& rootCommands, - amd::Command*& endCommand, hip::Stream* stream) { + std::vector& topoOrder, Graph* clonedGraph, + amd::Command*& graphStart, amd::Command*& graphEnd, hip::Stream* stream) { hipError_t status; for (auto& node : topoOrder) { // TODO: clone commands from next launch @@ -510,44 +510,48 @@ hipError_t FillCommands(std::vector>& parallelLists, } node->UpdateEventWaitLists(waitList); } - // rootCommand ensures graph is started (all parallel branches) after all the previous work is - // finished - bool first = true; - for (auto& singleList : parallelLists) { - if (first) { - first = false; - continue; - } - // marker from the same queue as the list - amd::Command* rootCommand = new amd::Marker(*singleList[0]->GetQueue(), false, {}); - amd::Command::EventWaitList waitList; - waitList.push_back(rootCommand); - if (!singleList.empty()) { - auto commands = singleList[0]->GetCommands(); + std::vector rootNodes = clonedGraph->GetRootNodes(); + ClPrint(amd::LOG_INFO, amd::LOG_CODE, + "[hipGraph] RootCommand get launched on stream (stream:%p)\n", stream); + for (auto& root : rootNodes) { + //If rootnode is launched on to the same stream dont add dependency + if (root->GetQueue() != stream) { + if (graphStart == nullptr) { + graphStart = new amd::Marker(*stream, false, {}); + if (graphStart == nullptr) { + return hipErrorOutOfMemory; + } + } + amd::Command::EventWaitList waitList; + waitList.push_back(graphStart); + auto commands = root->GetCommands(); if (!commands.empty()) { commands[0]->updateEventWaitList(waitList); - rootCommands.push_back(rootCommand); } } } - // endCommand ensures next enqueued ones start after graph is finished (all parallel branches) + + // graphEnd ensures next enqueued ones start after graph is finished (all parallel branches) amd::Command::EventWaitList graphLastCmdWaitList; - first = true; - for (auto& singleList : parallelLists) { - if (first) { - first = false; - continue; - } - if (!singleList.empty()) { - auto commands = singleList.back()->GetCommands(); + std::vector leafNodes = clonedGraph->GetLeafNodes(); + + for (auto& leaf : leafNodes) { + // If leaf node is launched on to the same stream dont add dependency + if (leaf->GetQueue() != stream) { + amd::Command::EventWaitList waitList; + waitList.push_back(graphEnd); + auto commands = leaf->GetCommands(); if (!commands.empty()) { graphLastCmdWaitList.push_back(commands.back()); } } } if (!graphLastCmdWaitList.empty()) { - endCommand = new amd::Marker(*stream, false, graphLastCmdWaitList); - if (endCommand == nullptr) { + graphEnd = new amd::Marker(*stream, false, graphLastCmdWaitList); + ClPrint(amd::LOG_INFO, amd::LOG_CODE, + "[hipGraph] EndCommand will get launched on stream (stream:%p)\n", stream); + if (graphEnd == nullptr) { + graphStart->release(); return hipErrorOutOfMemory; } } @@ -592,25 +596,24 @@ hipError_t GraphExec::Run(hipStream_t stream) { for (auto& node : topoOrder_) { if (node->GetType() == hipGraphNodeTypeMemAlloc && static_cast(node)->IsActiveMem() == true) { - return hipErrorInvalidValue; + return hipErrorInvalidValue; } } - } - else { + } else { repeatLaunch_ = true; } UpdateStream(parallelLists_, hip_stream, this); - std::vector rootCommands; + amd::Command* rootCommand = nullptr; amd::Command* endCommand = nullptr; - status = - FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, rootCommands, endCommand, hip_stream); + status = FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, clonedGraph_, rootCommand, + endCommand, hip_stream); if (status != hipSuccess) { return status; } - for (auto& cmd : rootCommands) { - cmd->enqueue(); - cmd->release(); + if (rootCommand != nullptr) { + rootCommand->enqueue(); + rootCommand->release(); } for (int i = 0; i < topoOrder_.size(); i++) { if (DEBUG_CLR_GRAPH_ENABLE_BUFFERING) { @@ -618,8 +621,7 @@ hipError_t GraphExec::Run(hipStream_t stream) { if (parallelLists_.size() == 1) { // Peep through the next node. If current and next node are kernel then enable AQL // buffering - if (((i + 1) != topoOrder_.size()) && - topoOrder_[i]->GetType() == hipGraphNodeTypeKernel && + if (((i + 1) != topoOrder_.size()) && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel && topoOrder_[i + 1]->GetType() == hipGraphNodeTypeKernel) { topoOrder_[i]->EnableBuffering(); } diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index 468c368c9d..3cf402c24b 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -34,6 +34,7 @@ #include "hip_platform.hpp" #include "hip_mempool_impl.hpp" #include "hip_vm.hpp" + namespace hip { struct Graph; struct GraphNode; @@ -42,8 +43,8 @@ struct UserObject; typedef GraphNode* Node; hipError_t FillCommands(std::vector>& parallelLists, std::unordered_map>& nodeWaitLists, - std::vector& topoOrder, std::vector& rootCommands, - amd::Command*& endCommand, hip::Stream* stream); + std::vector& topoOrder, Graph* clonedGraph, amd::Command*& graphStart, + amd::Command*& graphEnd, hip::Stream* stream); void UpdateStream(std::vector>& parallelLists, hip::Stream* stream, GraphExec* ptr); @@ -552,27 +553,26 @@ struct GraphExec { // Topological order of the graph doesn't include nodes embedded as part of the child graph std::vector topoOrder_; std::unordered_map> nodeWaitLists_; + struct Graph* clonedGraph_; std::vector parallel_streams_; uint currentQueueIndex_; std::unordered_map clonedNodes_; amd::Command* lastEnqueuedCommand_; static std::unordered_set graphExecSet_; - std::unordered_set graphExeUserObj_; static amd::Monitor graphExecSetLock_; uint64_t flags_ = 0; bool repeatLaunch_ = false; + public: GraphExec(std::vector& topoOrder, std::vector>& lists, - std::unordered_map>& nodeWaitLists, - std::unordered_map& clonedNodes, - std::unordered_set& userObjs, - uint64_t flags = 0) + std::unordered_map>& nodeWaitLists, struct Graph*& clonedGraph, + std::unordered_map& clonedNodes, uint64_t flags = 0) : parallelLists_(lists), topoOrder_(topoOrder), nodeWaitLists_(nodeWaitLists), + clonedGraph_(clonedGraph), clonedNodes_(clonedNodes), lastEnqueuedCommand_(nullptr), - graphExeUserObj_(userObjs), currentQueueIndex_(0), flags_(flags) { amd::ScopedLock lock(graphExecSetLock_); @@ -587,12 +587,9 @@ struct GraphExec { hip::Stream::Destroy(stream); } } - for (auto it = clonedNodes_.begin(); it != clonedNodes_.end(); it++) delete it->second; amd::ScopedLock lock(graphExecSetLock_); - for (auto userobj : graphExeUserObj_) { - userobj->release(); - } graphExecSet_.erase(this); + delete clonedGraph_; } Node GetClonedNode(Node node) { @@ -623,11 +620,14 @@ struct ChildGraphNode : public GraphNode { std::vector> parallelLists_; std::unordered_map> nodeWaitLists_; amd::Command* lastEnqueuedCommand_; - + amd::Command* startCommand_; + amd::Command* endCommand_; public: ChildGraphNode(Graph* g) : GraphNode(hipGraphNodeTypeGraph, "solid", "rectangle") { childGraph_ = g->clone(); lastEnqueuedCommand_ = nullptr; + startCommand_ = nullptr; + endCommand_ = nullptr; } ~ChildGraphNode() { delete childGraph_; } @@ -672,44 +672,41 @@ struct ChildGraphNode : public GraphNode { if (status != hipSuccess) { return status; } - commands_.reserve(2); - std::vector rootCommands; - amd::Command* endCommand = nullptr; - status = FillCommands(parallelLists_, nodeWaitLists_, childGraphNodeOrder_, rootCommands, - endCommand, stream); - for (auto& cmd : rootCommands) { - commands_.push_back(cmd); - } - if (endCommand != nullptr) { - commands_.push_back(endCommand); - } + startCommand_ = nullptr; + endCommand_ = nullptr; + status = FillCommands(parallelLists_, nodeWaitLists_, childGraphNodeOrder_, childGraph_, + startCommand_, endCommand_, stream); return status; } // void UpdateEventWaitLists(amd::Command::EventWaitList waitList) { - parallelLists_[0].front()->UpdateEventWaitLists(waitList); + if (startCommand_ != nullptr) { + startCommand_->updateEventWaitList(waitList); + } } void GetRunList(std::vector>& parallelList, std::unordered_map>& dependencies) { childGraph_->GetRunList(parallelLists_, nodeWaitLists_); } - bool TopologicalOrder(std::vector& TopoOrder) { return childGraph_->TopologicalOrder(TopoOrder); } + bool TopologicalOrder(std::vector& TopoOrder) { + return childGraph_->TopologicalOrder(TopoOrder); + } void EnqueueCommands(hipStream_t stream) { // enqueue child graph start command - if (commands_.size() == 1) { - commands_[0]->enqueue(); - commands_[0]->release(); + if (startCommand_ != nullptr) { + startCommand_->enqueue(); + startCommand_->release(); } // enqueue nodes in child graph in level order for (auto& node : childGraphNodeOrder_) { node->EnqueueCommands(stream); } // enqueue child graph end command - if (commands_.size() == 2) { - commands_[1]->enqueue(); - commands_[1]->release(); + if (endCommand_ != nullptr) { + endCommand_->enqueue(); + endCommand_->release(); } }