SWDEV-415772, SWDEV-414682 - Fix childgraph node execution
Change-Id: If9ffc08d98a57b8daa5f131f72ef1bf2317f29e1
[ROCm/clr commit: f76a40c26d]
This commit is contained in:
Committed by: Maneesh Gupta
Parent: 779e5c3a22
Commit: b25939b4de
@@ -1227,15 +1227,13 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
|
||||
}
|
||||
std::vector<std::vector<hip::GraphNode*>> parallelLists;
|
||||
std::unordered_map<hip::GraphNode*, std::vector<hip::GraphNode*>> nodeWaitLists;
|
||||
std::unordered_set<hip::UserObject*> graphExeUserObj;
|
||||
clonedGraph->GetRunList(parallelLists, nodeWaitLists);
|
||||
std::vector<hip::GraphNode*> graphNodes;
|
||||
if (false == clonedGraph->TopologicalOrder(graphNodes)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
clonedGraph->GetUserObjs(graphExeUserObj);
|
||||
*pGraphExec = new hip::GraphExec(graphNodes, parallelLists, nodeWaitLists, clonedNodes,
|
||||
graphExeUserObj, flags);
|
||||
*pGraphExec =
|
||||
new hip::GraphExec(graphNodes, parallelLists, nodeWaitLists, clonedGraph, clonedNodes, flags);
|
||||
if (*pGraphExec != nullptr) {
|
||||
graph->SetGraphInstantiated(true);
|
||||
return (*pGraphExec)->Init();
|
||||
|
||||
@@ -495,8 +495,8 @@ hipError_t GraphExec::Init() {
|
||||
|
||||
hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
|
||||
std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
|
||||
std::vector<Node>& topoOrder, std::vector<amd::Command*>& rootCommands,
|
||||
amd::Command*& endCommand, hip::Stream* stream) {
|
||||
std::vector<Node>& topoOrder, Graph* clonedGraph,
|
||||
amd::Command*& graphStart, amd::Command*& graphEnd, hip::Stream* stream) {
|
||||
hipError_t status;
|
||||
for (auto& node : topoOrder) {
|
||||
// TODO: clone commands from next launch
|
||||
@@ -510,44 +510,48 @@ hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
|
||||
}
|
||||
node->UpdateEventWaitLists(waitList);
|
||||
}
|
||||
// rootCommand ensures graph is started (all parallel branches) after all the previous work is
|
||||
// finished
|
||||
bool first = true;
|
||||
for (auto& singleList : parallelLists) {
|
||||
if (first) {
|
||||
first = false;
|
||||
continue;
|
||||
}
|
||||
// marker from the same queue as the list
|
||||
amd::Command* rootCommand = new amd::Marker(*singleList[0]->GetQueue(), false, {});
|
||||
amd::Command::EventWaitList waitList;
|
||||
waitList.push_back(rootCommand);
|
||||
if (!singleList.empty()) {
|
||||
auto commands = singleList[0]->GetCommands();
|
||||
std::vector<Node> rootNodes = clonedGraph->GetRootNodes();
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE,
|
||||
"[hipGraph] RootCommand get launched on stream (stream:%p)\n", stream);
|
||||
for (auto& root : rootNodes) {
|
||||
// If the root node is launched on the same stream, don't add a dependency
|
||||
if (root->GetQueue() != stream) {
|
||||
if (graphStart == nullptr) {
|
||||
graphStart = new amd::Marker(*stream, false, {});
|
||||
if (graphStart == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
}
|
||||
amd::Command::EventWaitList waitList;
|
||||
waitList.push_back(graphStart);
|
||||
auto commands = root->GetCommands();
|
||||
if (!commands.empty()) {
|
||||
commands[0]->updateEventWaitList(waitList);
|
||||
rootCommands.push_back(rootCommand);
|
||||
}
|
||||
}
|
||||
}
|
||||
// endCommand ensures next enqueued ones start after graph is finished (all parallel branches)
|
||||
|
||||
// graphEnd ensures that subsequently enqueued commands start only after the graph has finished (all parallel branches)
|
||||
amd::Command::EventWaitList graphLastCmdWaitList;
|
||||
first = true;
|
||||
for (auto& singleList : parallelLists) {
|
||||
if (first) {
|
||||
first = false;
|
||||
continue;
|
||||
}
|
||||
if (!singleList.empty()) {
|
||||
auto commands = singleList.back()->GetCommands();
|
||||
std::vector<Node> leafNodes = clonedGraph->GetLeafNodes();
|
||||
|
||||
for (auto& leaf : leafNodes) {
|
||||
// If the leaf node is launched on the same stream, don't add a dependency
|
||||
if (leaf->GetQueue() != stream) {
|
||||
amd::Command::EventWaitList waitList;
|
||||
waitList.push_back(graphEnd);
|
||||
auto commands = leaf->GetCommands();
|
||||
if (!commands.empty()) {
|
||||
graphLastCmdWaitList.push_back(commands.back());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!graphLastCmdWaitList.empty()) {
|
||||
endCommand = new amd::Marker(*stream, false, graphLastCmdWaitList);
|
||||
if (endCommand == nullptr) {
|
||||
graphEnd = new amd::Marker(*stream, false, graphLastCmdWaitList);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE,
|
||||
"[hipGraph] EndCommand will get launched on stream (stream:%p)\n", stream);
|
||||
if (graphEnd == nullptr) {
|
||||
graphStart->release();
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
}
|
||||
@@ -592,25 +596,24 @@ hipError_t GraphExec::Run(hipStream_t stream) {
|
||||
for (auto& node : topoOrder_) {
|
||||
if (node->GetType() == hipGraphNodeTypeMemAlloc &&
|
||||
static_cast<GraphMemAllocNode*>(node)->IsActiveMem() == true) {
|
||||
return hipErrorInvalidValue;
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
repeatLaunch_ = true;
|
||||
}
|
||||
|
||||
UpdateStream(parallelLists_, hip_stream, this);
|
||||
std::vector<amd::Command*> rootCommands;
|
||||
amd::Command* rootCommand = nullptr;
|
||||
amd::Command* endCommand = nullptr;
|
||||
status =
|
||||
FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, rootCommands, endCommand, hip_stream);
|
||||
status = FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, clonedGraph_, rootCommand,
|
||||
endCommand, hip_stream);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
for (auto& cmd : rootCommands) {
|
||||
cmd->enqueue();
|
||||
cmd->release();
|
||||
if (rootCommand != nullptr) {
|
||||
rootCommand->enqueue();
|
||||
rootCommand->release();
|
||||
}
|
||||
for (int i = 0; i < topoOrder_.size(); i++) {
|
||||
if (DEBUG_CLR_GRAPH_ENABLE_BUFFERING) {
|
||||
@@ -618,8 +621,7 @@ hipError_t GraphExec::Run(hipStream_t stream) {
|
||||
if (parallelLists_.size() == 1) {
|
||||
// Peek at the next node. If the current and next nodes are both kernels, then enable AQL
|
||||
// buffering
|
||||
if (((i + 1) != topoOrder_.size()) &&
|
||||
topoOrder_[i]->GetType() == hipGraphNodeTypeKernel &&
|
||||
if (((i + 1) != topoOrder_.size()) && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel &&
|
||||
topoOrder_[i + 1]->GetType() == hipGraphNodeTypeKernel) {
|
||||
topoOrder_[i]->EnableBuffering();
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "hip_platform.hpp"
|
||||
#include "hip_mempool_impl.hpp"
|
||||
#include "hip_vm.hpp"
|
||||
|
||||
namespace hip {
|
||||
struct Graph;
|
||||
struct GraphNode;
|
||||
@@ -42,8 +43,8 @@ struct UserObject;
|
||||
typedef GraphNode* Node;
|
||||
hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
|
||||
std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
|
||||
std::vector<Node>& topoOrder, std::vector<amd::Command*>& rootCommands,
|
||||
amd::Command*& endCommand, hip::Stream* stream);
|
||||
std::vector<Node>& topoOrder, Graph* clonedGraph, amd::Command*& graphStart,
|
||||
amd::Command*& graphEnd, hip::Stream* stream);
|
||||
void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* stream,
|
||||
GraphExec* ptr);
|
||||
|
||||
@@ -552,27 +553,26 @@ struct GraphExec {
|
||||
// Topological order of the graph doesn't include nodes embedded as part of the child graph
|
||||
std::vector<Node> topoOrder_;
|
||||
std::unordered_map<Node, std::vector<Node>> nodeWaitLists_;
|
||||
struct Graph* clonedGraph_;
|
||||
std::vector<hip::Stream*> parallel_streams_;
|
||||
uint currentQueueIndex_;
|
||||
std::unordered_map<Node, Node> clonedNodes_;
|
||||
amd::Command* lastEnqueuedCommand_;
|
||||
static std::unordered_set<GraphExec*> graphExecSet_;
|
||||
std::unordered_set<UserObject*> graphExeUserObj_;
|
||||
static amd::Monitor graphExecSetLock_;
|
||||
uint64_t flags_ = 0;
|
||||
bool repeatLaunch_ = false;
|
||||
|
||||
public:
|
||||
GraphExec(std::vector<Node>& topoOrder, std::vector<std::vector<Node>>& lists,
|
||||
std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
|
||||
std::unordered_map<Node, Node>& clonedNodes,
|
||||
std::unordered_set<UserObject*>& userObjs,
|
||||
uint64_t flags = 0)
|
||||
std::unordered_map<Node, std::vector<Node>>& nodeWaitLists, struct Graph*& clonedGraph,
|
||||
std::unordered_map<Node, Node>& clonedNodes, uint64_t flags = 0)
|
||||
: parallelLists_(lists),
|
||||
topoOrder_(topoOrder),
|
||||
nodeWaitLists_(nodeWaitLists),
|
||||
clonedGraph_(clonedGraph),
|
||||
clonedNodes_(clonedNodes),
|
||||
lastEnqueuedCommand_(nullptr),
|
||||
graphExeUserObj_(userObjs),
|
||||
currentQueueIndex_(0),
|
||||
flags_(flags) {
|
||||
amd::ScopedLock lock(graphExecSetLock_);
|
||||
@@ -587,12 +587,9 @@ struct GraphExec {
|
||||
hip::Stream::Destroy(stream);
|
||||
}
|
||||
}
|
||||
for (auto it = clonedNodes_.begin(); it != clonedNodes_.end(); it++) delete it->second;
|
||||
amd::ScopedLock lock(graphExecSetLock_);
|
||||
for (auto userobj : graphExeUserObj_) {
|
||||
userobj->release();
|
||||
}
|
||||
graphExecSet_.erase(this);
|
||||
delete clonedGraph_;
|
||||
}
|
||||
|
||||
Node GetClonedNode(Node node) {
|
||||
@@ -623,11 +620,14 @@ struct ChildGraphNode : public GraphNode {
|
||||
std::vector<std::vector<Node>> parallelLists_;
|
||||
std::unordered_map<Node, std::vector<Node>> nodeWaitLists_;
|
||||
amd::Command* lastEnqueuedCommand_;
|
||||
|
||||
amd::Command* startCommand_;
|
||||
amd::Command* endCommand_;
|
||||
public:
|
||||
ChildGraphNode(Graph* g) : GraphNode(hipGraphNodeTypeGraph, "solid", "rectangle") {
|
||||
childGraph_ = g->clone();
|
||||
lastEnqueuedCommand_ = nullptr;
|
||||
startCommand_ = nullptr;
|
||||
endCommand_ = nullptr;
|
||||
}
|
||||
|
||||
~ChildGraphNode() { delete childGraph_; }
|
||||
@@ -672,44 +672,41 @@ struct ChildGraphNode : public GraphNode {
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
commands_.reserve(2);
|
||||
std::vector<amd::Command*> rootCommands;
|
||||
amd::Command* endCommand = nullptr;
|
||||
status = FillCommands(parallelLists_, nodeWaitLists_, childGraphNodeOrder_, rootCommands,
|
||||
endCommand, stream);
|
||||
for (auto& cmd : rootCommands) {
|
||||
commands_.push_back(cmd);
|
||||
}
|
||||
if (endCommand != nullptr) {
|
||||
commands_.push_back(endCommand);
|
||||
}
|
||||
startCommand_ = nullptr;
|
||||
endCommand_ = nullptr;
|
||||
status = FillCommands(parallelLists_, nodeWaitLists_, childGraphNodeOrder_, childGraph_,
|
||||
startCommand_, endCommand_, stream);
|
||||
return status;
|
||||
}
|
||||
|
||||
//
|
||||
void UpdateEventWaitLists(amd::Command::EventWaitList waitList) {
|
||||
parallelLists_[0].front()->UpdateEventWaitLists(waitList);
|
||||
if (startCommand_ != nullptr) {
|
||||
startCommand_->updateEventWaitList(waitList);
|
||||
}
|
||||
}
|
||||
|
||||
void GetRunList(std::vector<std::vector<Node>>& parallelList,
|
||||
std::unordered_map<Node, std::vector<Node>>& dependencies) {
|
||||
childGraph_->GetRunList(parallelLists_, nodeWaitLists_);
|
||||
}
|
||||
bool TopologicalOrder(std::vector<Node>& TopoOrder) { return childGraph_->TopologicalOrder(TopoOrder); }
|
||||
bool TopologicalOrder(std::vector<Node>& TopoOrder) {
|
||||
return childGraph_->TopologicalOrder(TopoOrder);
|
||||
}
|
||||
void EnqueueCommands(hipStream_t stream) {
|
||||
// enqueue child graph start command
|
||||
if (commands_.size() == 1) {
|
||||
commands_[0]->enqueue();
|
||||
commands_[0]->release();
|
||||
if (startCommand_ != nullptr) {
|
||||
startCommand_->enqueue();
|
||||
startCommand_->release();
|
||||
}
|
||||
// enqueue nodes in child graph in level order
|
||||
for (auto& node : childGraphNodeOrder_) {
|
||||
node->EnqueueCommands(stream);
|
||||
}
|
||||
// enqueue child graph end command
|
||||
if (commands_.size() == 2) {
|
||||
commands_[1]->enqueue();
|
||||
commands_[1]->release();
|
||||
if (endCommand_ != nullptr) {
|
||||
endCommand_->enqueue();
|
||||
endCommand_->release();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user