SWDEV-415772, SWDEV-414682 - Fix childgraph node execution

Change-Id: If9ffc08d98a57b8daa5f131f72ef1bf2317f29e1 [ROCm/clr commit: f76a40c26d]
2023-08-08 13:25:46 +00:00
parent 779e5c3a22
commit b25939b4de
3 changed files with 72 additions and 75 deletions
@@ -1227,15 +1227,13 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
  }
  std::vector<std::vector<hip::GraphNode*>> parallelLists;
  std::unordered_map<hip::GraphNode*, std::vector<hip::GraphNode*>> nodeWaitLists;
-  std::unordered_set<hip::UserObject*> graphExeUserObj;
  clonedGraph->GetRunList(parallelLists, nodeWaitLists);
  std::vector<hip::GraphNode*> graphNodes;
  if (false == clonedGraph->TopologicalOrder(graphNodes)) {
    return hipErrorInvalidValue;
  }
-  clonedGraph->GetUserObjs(graphExeUserObj);
-  *pGraphExec = new hip::GraphExec(graphNodes, parallelLists, nodeWaitLists, clonedNodes,
-                                   graphExeUserObj, flags);
+  *pGraphExec =
+      new hip::GraphExec(graphNodes, parallelLists, nodeWaitLists, clonedGraph, clonedNodes, flags);
  if (*pGraphExec != nullptr) {
    graph->SetGraphInstantiated(true);
    return (*pGraphExec)->Init();
@@ -495,8 +495,8 @@ hipError_t GraphExec::Init() {

 hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
                        std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
-                        std::vector<Node>& topoOrder, std::vector<amd::Command*>& rootCommands,
-                        amd::Command*& endCommand, hip::Stream* stream) {
+                        std::vector<Node>& topoOrder, Graph* clonedGraph,
+                        amd::Command*& graphStart, amd::Command*& graphEnd, hip::Stream* stream) {
  hipError_t status;
  for (auto& node : topoOrder) {
    // TODO: clone commands from next launch
@@ -510,44 +510,48 @@ hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
    }
    node->UpdateEventWaitLists(waitList);
  }
-  // rootCommand ensures graph is started (all parallel branches) after all the previous work is
-  // finished
-  bool first = true;
-  for (auto& singleList : parallelLists) {
-    if (first) {
-      first = false;
-      continue;
-    }
-    // marker from the same queue as the list
-    amd::Command* rootCommand = new amd::Marker(*singleList[0]->GetQueue(), false, {});
-    amd::Command::EventWaitList waitList;
-    waitList.push_back(rootCommand);
-    if (!singleList.empty()) {
-      auto commands = singleList[0]->GetCommands();
+  std::vector<Node> rootNodes = clonedGraph->GetRootNodes();
+  ClPrint(amd::LOG_INFO, amd::LOG_CODE,
+          "[hipGraph] RootCommand get launched on stream (stream:%p)\n", stream);
+  for (auto& root : rootNodes) {
+    // If the root node is launched on the same stream, don't add a dependency
+    if (root->GetQueue() != stream) {
+      if (graphStart == nullptr) {
+        graphStart = new amd::Marker(*stream, false, {});
+        if (graphStart == nullptr) {
+          return hipErrorOutOfMemory;
+        }
+      }
+      amd::Command::EventWaitList waitList;
+      waitList.push_back(graphStart);
+      auto commands = root->GetCommands();
      if (!commands.empty()) {
        commands[0]->updateEventWaitList(waitList);
-        rootCommands.push_back(rootCommand);
      }
    }
  }
-  // endCommand ensures next enqueued ones start after graph is finished (all parallel branches)
+
+  // graphEnd makes later commands wait until the graph has finished (all parallel branches)
  amd::Command::EventWaitList graphLastCmdWaitList;
-  first = true;
-  for (auto& singleList : parallelLists) {
-    if (first) {
-      first = false;
-      continue;
-    }
-    if (!singleList.empty()) {
-      auto commands = singleList.back()->GetCommands();
+  std::vector<Node> leafNodes = clonedGraph->GetLeafNodes();
+
+  for (auto& leaf : leafNodes) {
+    // If the leaf node is launched on the same stream, don't add a dependency
+    if (leaf->GetQueue() != stream) {
+      amd::Command::EventWaitList waitList;
+      waitList.push_back(graphEnd);
+      auto commands = leaf->GetCommands();
      if (!commands.empty()) {
        graphLastCmdWaitList.push_back(commands.back());
      }
    }
  }
  if (!graphLastCmdWaitList.empty()) {
-    endCommand = new amd::Marker(*stream, false, graphLastCmdWaitList);
-    if (endCommand == nullptr) {
+    graphEnd = new amd::Marker(*stream, false, graphLastCmdWaitList);
+    ClPrint(amd::LOG_INFO, amd::LOG_CODE,
+            "[hipGraph] EndCommand will get launched on stream (stream:%p)\n", stream);
+    if (graphEnd == nullptr) {
+      graphStart->release();
      return hipErrorOutOfMemory;
    }
  }
@@ -592,25 +596,24 @@ hipError_t GraphExec::Run(hipStream_t stream) {
    for (auto& node : topoOrder_) {
      if (node->GetType() == hipGraphNodeTypeMemAlloc &&
          static_cast<GraphMemAllocNode*>(node)->IsActiveMem() == true) {
-          return hipErrorInvalidValue;
+        return hipErrorInvalidValue;
      }
    }
-  }
-  else {
+  } else {
    repeatLaunch_ = true;
  }

  UpdateStream(parallelLists_, hip_stream, this);
-  std::vector<amd::Command*> rootCommands;
+  amd::Command* rootCommand = nullptr;
  amd::Command* endCommand = nullptr;
-  status =
-      FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, rootCommands, endCommand, hip_stream);
+  status = FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, clonedGraph_, rootCommand,
+                        endCommand, hip_stream);
  if (status != hipSuccess) {
    return status;
  }
-  for (auto& cmd : rootCommands) {
-    cmd->enqueue();
-    cmd->release();
+  if (rootCommand != nullptr) {
+    rootCommand->enqueue();
+    rootCommand->release();
  }
  for (int i = 0; i < topoOrder_.size(); i++) {
    if (DEBUG_CLR_GRAPH_ENABLE_BUFFERING) {
@@ -618,8 +621,7 @@ hipError_t GraphExec::Run(hipStream_t stream) {
      if (parallelLists_.size() == 1) {
        // Peek at the next node. If the current and next nodes are both kernels, enable AQL
        // buffering
-        if (((i + 1) != topoOrder_.size()) &&
-            topoOrder_[i]->GetType() == hipGraphNodeTypeKernel &&
+        if (((i + 1) != topoOrder_.size()) && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel &&
            topoOrder_[i + 1]->GetType() == hipGraphNodeTypeKernel) {
          topoOrder_[i]->EnableBuffering();
        }
@@ -34,6 +34,7 @@
 #include "hip_platform.hpp"
 #include "hip_mempool_impl.hpp"
 #include "hip_vm.hpp"
+
 namespace hip {
 struct Graph;
 struct GraphNode;
@@ -42,8 +43,8 @@ struct UserObject;
 typedef GraphNode* Node;
 hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
                        std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
-                        std::vector<Node>& topoOrder, std::vector<amd::Command*>& rootCommands,
-                        amd::Command*& endCommand, hip::Stream* stream);
+                        std::vector<Node>& topoOrder, Graph* clonedGraph, amd::Command*& graphStart,
+                        amd::Command*& graphEnd, hip::Stream* stream);
 void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* stream,
                  GraphExec* ptr);

@@ -552,27 +553,26 @@ struct GraphExec {
  // Topological order of the graph doesn't include nodes embedded as part of the child graph
  std::vector<Node> topoOrder_;
  std::unordered_map<Node, std::vector<Node>> nodeWaitLists_;
+  struct Graph* clonedGraph_;
  std::vector<hip::Stream*> parallel_streams_;
  uint currentQueueIndex_;
  std::unordered_map<Node, Node> clonedNodes_;
  amd::Command* lastEnqueuedCommand_;
  static std::unordered_set<GraphExec*> graphExecSet_;
-  std::unordered_set<UserObject*> graphExeUserObj_;
  static amd::Monitor graphExecSetLock_;
  uint64_t flags_ = 0;
  bool repeatLaunch_ = false;
+
 public:
  GraphExec(std::vector<Node>& topoOrder, std::vector<std::vector<Node>>& lists,
-               std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
-               std::unordered_map<Node, Node>& clonedNodes,
-               std::unordered_set<UserObject*>& userObjs,
-               uint64_t flags = 0)
+            std::unordered_map<Node, std::vector<Node>>& nodeWaitLists, struct Graph*& clonedGraph,
+            std::unordered_map<Node, Node>& clonedNodes, uint64_t flags = 0)
      : parallelLists_(lists),
        topoOrder_(topoOrder),
        nodeWaitLists_(nodeWaitLists),
+        clonedGraph_(clonedGraph),
        clonedNodes_(clonedNodes),
        lastEnqueuedCommand_(nullptr),
-        graphExeUserObj_(userObjs),
        currentQueueIndex_(0),
        flags_(flags) {
    amd::ScopedLock lock(graphExecSetLock_);
@@ -587,12 +587,9 @@ struct GraphExec {
        hip::Stream::Destroy(stream);
      }
    }
-    for (auto it = clonedNodes_.begin(); it != clonedNodes_.end(); it++) delete it->second;
    amd::ScopedLock lock(graphExecSetLock_);
-    for (auto userobj : graphExeUserObj_) {
-      userobj->release();
-    }
    graphExecSet_.erase(this);
+    delete clonedGraph_;
  }

  Node GetClonedNode(Node node) {
@@ -623,11 +620,14 @@ struct ChildGraphNode : public GraphNode {
  std::vector<std::vector<Node>> parallelLists_;
  std::unordered_map<Node, std::vector<Node>> nodeWaitLists_;
  amd::Command* lastEnqueuedCommand_;
-
+  amd::Command* startCommand_;
+  amd::Command* endCommand_;
 public:
  ChildGraphNode(Graph* g) : GraphNode(hipGraphNodeTypeGraph, "solid", "rectangle") {
    childGraph_ = g->clone();
    lastEnqueuedCommand_ = nullptr;
+    startCommand_ = nullptr;
+    endCommand_ = nullptr;
  }

  ~ChildGraphNode() { delete childGraph_; }
@@ -672,44 +672,41 @@ struct ChildGraphNode : public GraphNode {
    if (status != hipSuccess) {
      return status;
    }
-    commands_.reserve(2);
-    std::vector<amd::Command*> rootCommands;
-    amd::Command* endCommand = nullptr;
-    status = FillCommands(parallelLists_, nodeWaitLists_, childGraphNodeOrder_, rootCommands,
-                          endCommand, stream);
-    for (auto& cmd : rootCommands) {
-      commands_.push_back(cmd);
-    }
-    if (endCommand != nullptr) {
-      commands_.push_back(endCommand);
-    }
+    startCommand_ = nullptr;
+    endCommand_ = nullptr;
+    status = FillCommands(parallelLists_, nodeWaitLists_, childGraphNodeOrder_, childGraph_,
+                          startCommand_, endCommand_, stream);
    return status;
  }

  //
  void UpdateEventWaitLists(amd::Command::EventWaitList waitList) {
-    parallelLists_[0].front()->UpdateEventWaitLists(waitList);
+    if (startCommand_ != nullptr) {
+      startCommand_->updateEventWaitList(waitList);
+    }
  }

  void GetRunList(std::vector<std::vector<Node>>& parallelList,
                  std::unordered_map<Node, std::vector<Node>>& dependencies) {
    childGraph_->GetRunList(parallelLists_, nodeWaitLists_);
  }
-  bool TopologicalOrder(std::vector<Node>& TopoOrder) { return childGraph_->TopologicalOrder(TopoOrder); }
+  bool TopologicalOrder(std::vector<Node>& TopoOrder) {
+    return childGraph_->TopologicalOrder(TopoOrder);
+  }
  void EnqueueCommands(hipStream_t stream) {
    // enqueue child graph start command
-    if (commands_.size() == 1) {
-      commands_[0]->enqueue();
-      commands_[0]->release();
+    if (startCommand_ != nullptr) {
+      startCommand_->enqueue();
+      startCommand_->release();
    }
    // enqueue nodes in child graph in level order
    for (auto& node : childGraphNodeOrder_) {
      node->EnqueueCommands(stream);
    }
    // enqueue child graph end command
-    if (commands_.size() == 2) {
-      commands_[1]->enqueue();
-      commands_[1]->release();
+    if (endCommand_ != nullptr) {
+      endCommand_->enqueue();
+      endCommand_->release();
    }
  }