SWDEV-415772, SWDEV-414682 - Fix childgraph node execution

Change-Id: If9ffc08d98a57b8daa5f131f72ef1bf2317f29e1


[ROCm/clr commit: f76a40c26d]
This commit is contained in:
Anusha GodavarthySurya
2023-08-08 13:25:46 +00:00
committed by Maneesh Gupta
parent 779e5c3a22
commit b25939b4de
3 changed files with 72 additions and 75 deletions
+2 -4
View File
@@ -1227,15 +1227,13 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
}
std::vector<std::vector<hip::GraphNode*>> parallelLists;
std::unordered_map<hip::GraphNode*, std::vector<hip::GraphNode*>> nodeWaitLists;
std::unordered_set<hip::UserObject*> graphExeUserObj;
clonedGraph->GetRunList(parallelLists, nodeWaitLists);
std::vector<hip::GraphNode*> graphNodes;
if (false == clonedGraph->TopologicalOrder(graphNodes)) {
return hipErrorInvalidValue;
}
clonedGraph->GetUserObjs(graphExeUserObj);
*pGraphExec = new hip::GraphExec(graphNodes, parallelLists, nodeWaitLists, clonedNodes,
graphExeUserObj, flags);
*pGraphExec =
new hip::GraphExec(graphNodes, parallelLists, nodeWaitLists, clonedGraph, clonedNodes, flags);
if (*pGraphExec != nullptr) {
graph->SetGraphInstantiated(true);
return (*pGraphExec)->Init();
+41 -39
View File
@@ -495,8 +495,8 @@ hipError_t GraphExec::Init() {
hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
std::vector<Node>& topoOrder, std::vector<amd::Command*>& rootCommands,
amd::Command*& endCommand, hip::Stream* stream) {
std::vector<Node>& topoOrder, Graph* clonedGraph,
amd::Command*& graphStart, amd::Command*& graphEnd, hip::Stream* stream) {
hipError_t status;
for (auto& node : topoOrder) {
// TODO: clone commands from next launch
@@ -510,44 +510,48 @@ hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
}
node->UpdateEventWaitLists(waitList);
}
// rootCommand ensures graph is started (all parallel branches) after all the previous work is
// finished
bool first = true;
for (auto& singleList : parallelLists) {
if (first) {
first = false;
continue;
}
// marker from the same queue as the list
amd::Command* rootCommand = new amd::Marker(*singleList[0]->GetQueue(), false, {});
amd::Command::EventWaitList waitList;
waitList.push_back(rootCommand);
if (!singleList.empty()) {
auto commands = singleList[0]->GetCommands();
std::vector<Node> rootNodes = clonedGraph->GetRootNodes();
ClPrint(amd::LOG_INFO, amd::LOG_CODE,
"[hipGraph] RootCommand get launched on stream (stream:%p)\n", stream);
for (auto& root : rootNodes) {
// If the root node is launched on the same stream, don't add a dependency
if (root->GetQueue() != stream) {
if (graphStart == nullptr) {
graphStart = new amd::Marker(*stream, false, {});
if (graphStart == nullptr) {
return hipErrorOutOfMemory;
}
}
amd::Command::EventWaitList waitList;
waitList.push_back(graphStart);
auto commands = root->GetCommands();
if (!commands.empty()) {
commands[0]->updateEventWaitList(waitList);
rootCommands.push_back(rootCommand);
}
}
}
// endCommand ensures next enqueued ones start after graph is finished (all parallel branches)
// graphEnd ensures next enqueued ones start after graph is finished (all parallel branches)
amd::Command::EventWaitList graphLastCmdWaitList;
first = true;
for (auto& singleList : parallelLists) {
if (first) {
first = false;
continue;
}
if (!singleList.empty()) {
auto commands = singleList.back()->GetCommands();
std::vector<Node> leafNodes = clonedGraph->GetLeafNodes();
for (auto& leaf : leafNodes) {
// If the leaf node is launched on the same stream, don't add a dependency
if (leaf->GetQueue() != stream) {
amd::Command::EventWaitList waitList;
waitList.push_back(graphEnd);
auto commands = leaf->GetCommands();
if (!commands.empty()) {
graphLastCmdWaitList.push_back(commands.back());
}
}
}
if (!graphLastCmdWaitList.empty()) {
endCommand = new amd::Marker(*stream, false, graphLastCmdWaitList);
if (endCommand == nullptr) {
graphEnd = new amd::Marker(*stream, false, graphLastCmdWaitList);
ClPrint(amd::LOG_INFO, amd::LOG_CODE,
"[hipGraph] EndCommand will get launched on stream (stream:%p)\n", stream);
if (graphEnd == nullptr) {
graphStart->release();
return hipErrorOutOfMemory;
}
}
@@ -592,25 +596,24 @@ hipError_t GraphExec::Run(hipStream_t stream) {
for (auto& node : topoOrder_) {
if (node->GetType() == hipGraphNodeTypeMemAlloc &&
static_cast<GraphMemAllocNode*>(node)->IsActiveMem() == true) {
return hipErrorInvalidValue;
return hipErrorInvalidValue;
}
}
}
else {
} else {
repeatLaunch_ = true;
}
UpdateStream(parallelLists_, hip_stream, this);
std::vector<amd::Command*> rootCommands;
amd::Command* rootCommand = nullptr;
amd::Command* endCommand = nullptr;
status =
FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, rootCommands, endCommand, hip_stream);
status = FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, clonedGraph_, rootCommand,
endCommand, hip_stream);
if (status != hipSuccess) {
return status;
}
for (auto& cmd : rootCommands) {
cmd->enqueue();
cmd->release();
if (rootCommand != nullptr) {
rootCommand->enqueue();
rootCommand->release();
}
for (int i = 0; i < topoOrder_.size(); i++) {
if (DEBUG_CLR_GRAPH_ENABLE_BUFFERING) {
@@ -618,8 +621,7 @@ hipError_t GraphExec::Run(hipStream_t stream) {
if (parallelLists_.size() == 1) {
// Peep through the next node. If current and next node are kernel then enable AQL
// buffering
if (((i + 1) != topoOrder_.size()) &&
topoOrder_[i]->GetType() == hipGraphNodeTypeKernel &&
if (((i + 1) != topoOrder_.size()) && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel &&
topoOrder_[i + 1]->GetType() == hipGraphNodeTypeKernel) {
topoOrder_[i]->EnableBuffering();
}
+29 -32
View File
@@ -34,6 +34,7 @@
#include "hip_platform.hpp"
#include "hip_mempool_impl.hpp"
#include "hip_vm.hpp"
namespace hip {
struct Graph;
struct GraphNode;
@@ -42,8 +43,8 @@ struct UserObject;
typedef GraphNode* Node;
hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
std::vector<Node>& topoOrder, std::vector<amd::Command*>& rootCommands,
amd::Command*& endCommand, hip::Stream* stream);
std::vector<Node>& topoOrder, Graph* clonedGraph, amd::Command*& graphStart,
amd::Command*& graphEnd, hip::Stream* stream);
void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* stream,
GraphExec* ptr);
@@ -552,27 +553,26 @@ struct GraphExec {
// Topological order of the graph doesn't include nodes embedded as part of the child graph
std::vector<Node> topoOrder_;
std::unordered_map<Node, std::vector<Node>> nodeWaitLists_;
struct Graph* clonedGraph_;
std::vector<hip::Stream*> parallel_streams_;
uint currentQueueIndex_;
std::unordered_map<Node, Node> clonedNodes_;
amd::Command* lastEnqueuedCommand_;
static std::unordered_set<GraphExec*> graphExecSet_;
std::unordered_set<UserObject*> graphExeUserObj_;
static amd::Monitor graphExecSetLock_;
uint64_t flags_ = 0;
bool repeatLaunch_ = false;
public:
GraphExec(std::vector<Node>& topoOrder, std::vector<std::vector<Node>>& lists,
std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
std::unordered_map<Node, Node>& clonedNodes,
std::unordered_set<UserObject*>& userObjs,
uint64_t flags = 0)
std::unordered_map<Node, std::vector<Node>>& nodeWaitLists, struct Graph*& clonedGraph,
std::unordered_map<Node, Node>& clonedNodes, uint64_t flags = 0)
: parallelLists_(lists),
topoOrder_(topoOrder),
nodeWaitLists_(nodeWaitLists),
clonedGraph_(clonedGraph),
clonedNodes_(clonedNodes),
lastEnqueuedCommand_(nullptr),
graphExeUserObj_(userObjs),
currentQueueIndex_(0),
flags_(flags) {
amd::ScopedLock lock(graphExecSetLock_);
@@ -587,12 +587,9 @@ struct GraphExec {
hip::Stream::Destroy(stream);
}
}
for (auto it = clonedNodes_.begin(); it != clonedNodes_.end(); it++) delete it->second;
amd::ScopedLock lock(graphExecSetLock_);
for (auto userobj : graphExeUserObj_) {
userobj->release();
}
graphExecSet_.erase(this);
delete clonedGraph_;
}
Node GetClonedNode(Node node) {
@@ -623,11 +620,14 @@ struct ChildGraphNode : public GraphNode {
std::vector<std::vector<Node>> parallelLists_;
std::unordered_map<Node, std::vector<Node>> nodeWaitLists_;
amd::Command* lastEnqueuedCommand_;
amd::Command* startCommand_;
amd::Command* endCommand_;
public:
// Builds a graph-type node (hipGraphNodeTypeGraph) that keeps its own copy of `g`,
// obtained through g->clone(), in childGraph_.
// All three command pointers start out null.
ChildGraphNode(Graph* g) : GraphNode(hipGraphNodeTypeGraph, "solid", "rectangle") {
childGraph_ = g->clone();
lastEnqueuedCommand_ = nullptr;
startCommand_ = nullptr;
endCommand_ = nullptr;
}
// Deletes the graph held in childGraph_.
~ChildGraphNode() { delete childGraph_; }
@@ -672,44 +672,41 @@ struct ChildGraphNode : public GraphNode {
if (status != hipSuccess) {
return status;
}
commands_.reserve(2);
std::vector<amd::Command*> rootCommands;
amd::Command* endCommand = nullptr;
status = FillCommands(parallelLists_, nodeWaitLists_, childGraphNodeOrder_, rootCommands,
endCommand, stream);
for (auto& cmd : rootCommands) {
commands_.push_back(cmd);
}
if (endCommand != nullptr) {
commands_.push_back(endCommand);
}
startCommand_ = nullptr;
endCommand_ = nullptr;
status = FillCommands(parallelLists_, nodeWaitLists_, childGraphNodeOrder_, childGraph_,
startCommand_, endCommand_, stream);
return status;
}
//
void UpdateEventWaitLists(amd::Command::EventWaitList waitList) {
parallelLists_[0].front()->UpdateEventWaitLists(waitList);
if (startCommand_ != nullptr) {
startCommand_->updateEventWaitList(waitList);
}
}
// Asks childGraph_ to fill this node's parallelLists_ and nodeWaitLists_ members.
// NOTE(review): the `parallelList` and `dependencies` arguments are not read or written
// here; the results go into the members instead. Presumably intentional — confirm
// against callers.
void GetRunList(std::vector<std::vector<Node>>& parallelList,
std::unordered_map<Node, std::vector<Node>>& dependencies) {
childGraph_->GetRunList(parallelLists_, nodeWaitLists_);
}
bool TopologicalOrder(std::vector<Node>& TopoOrder) { return childGraph_->TopologicalOrder(TopoOrder); }
// Forwards to childGraph_->TopologicalOrder(TopoOrder) and returns its result unchanged.
bool TopologicalOrder(std::vector<Node>& TopoOrder) {
return childGraph_->TopologicalOrder(TopoOrder);
}
void EnqueueCommands(hipStream_t stream) {
// enqueue child graph start command
if (commands_.size() == 1) {
commands_[0]->enqueue();
commands_[0]->release();
if (startCommand_ != nullptr) {
startCommand_->enqueue();
startCommand_->release();
}
// enqueue nodes in child graph in level order
for (auto& node : childGraphNodeOrder_) {
node->EnqueueCommands(stream);
}
// enqueue child graph end command
if (commands_.size() == 2) {
commands_[1]->enqueue();
commands_[1]->release();
if (endCommand_ != nullptr) {
endCommand_->enqueue();
endCommand_->release();
}
}