diff --git a/hipamd/src/hip_graph.cpp b/hipamd/src/hip_graph.cpp index 869d60c210..b608c03d7c 100644 --- a/hipamd/src/hip_graph.cpp +++ b/hipamd/src/hip_graph.cpp @@ -1864,8 +1864,9 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra reinterpret_cast(clonedNode)->TopologicalOrder(childGraphNodes); for (std::vector::size_type i = 0; i != childGraphNodes.size(); i++) { if (childGraphNodes[i]->GraphCaptureEnabled()) { - status = reinterpret_cast(hGraphExec) - ->UpdateAQLPacket(reinterpret_cast(childGraphNodes[i])); + status = reinterpret_cast(clonedNode) + ->graphExec_.UpdateAQLPacket( + reinterpret_cast(childGraphNodes[i])); if (status != hipSuccess) { return status; } diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index 9735519cb0..4fa163655e 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -62,7 +62,8 @@ std::unordered_set Graph::graphSet_; amd::Monitor Graph::graphSetLock_{}; std::unordered_set GraphExec::graphExecSet_; // Guards global exec graph set -amd::Monitor GraphExec::graphExecSetLock_{}; +// we have graphExec object as part of child graph and we need recursive lock +amd::Monitor GraphExec::graphExecSetLock_(true); std::unordered_set UserObject::ObjectSet_; // Guards global user object amd::Monitor UserObject::UserObjectLock_{}; @@ -93,6 +94,7 @@ hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t return hipSuccess; } +// ================================================================================================ hipError_t GraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParams) { hipError_t status; status = ihipMemcpy3D_validate(pNodeParams); @@ -108,6 +110,7 @@ hipError_t GraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParams) return hipSuccess; } +// ================================================================================================ bool Graph::isGraphValid(Graph* pGraph) { amd::ScopedLock lock(graphSetLock_); if (graphSet_.find(pGraph) == graphSet_.end()) { @@ -116,6 +119,7 @@ bool Graph::isGraphValid(Graph* pGraph) { return true; } +// ================================================================================================ void Graph::AddNode(const Node& node) { vertices_.emplace_back(node); ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Add %s(%p)", @@ -123,11 +127,13 @@ void Graph::AddNode(const Node& node) { node->SetParentGraph(this); } +// ================================================================================================ void Graph::RemoveNode(const Node& node) { vertices_.erase(std::remove(vertices_.begin(), vertices_.end(), node), vertices_.end()); delete node; } +// ================================================================================================ // root nodes are all vertices with 0 in-degrees std::vector Graph::GetRootNodes() const { std::vector roots; @@ -141,6 +147,7 @@ std::vector Graph::GetRootNodes() const { return roots; } +// ================================================================================================ // leaf nodes are all vertices with 0 out-degrees std::vector Graph::GetLeafNodes() const { std::vector leafNodes; @@ -152,6 +159,7 @@ std::vector Graph::GetLeafNodes() const { return leafNodes; } +// ================================================================================================ size_t Graph::GetLeafNodeCount() const { int numLeafNodes = 0; for (auto entry : vertices_) { @@ -181,7 +189,7 @@ void Graph::ScheduleOneNode(Node node, int stream_id) { // Process child graph separately, since, there is no connection if (node->GetType() == hipGraphNodeTypeGraph) { - auto child = reinterpret_cast(node)->childGraph_; + auto child = reinterpret_cast(node)->GetChildGraph(); child->ScheduleNodes(); max_streams_ = std::max(max_streams_, child->max_streams_); if (child->max_streams_ == 1) { @@ -260,6 +268,7 @@ bool Graph::TopologicalOrder(std::vector& TopoOrder) { return false; } +// ================================================================================================ Graph* Graph::clone(std::unordered_map& clonedNodes) const { Graph* newGraph = new Graph(device_, this); for (auto entry : vertices_) { @@ -301,11 +310,13 @@ Graph* Graph::clone(std::unordered_map& clonedNodes) const { return newGraph; } +// ================================================================================================ Graph* Graph::clone() const { std::unordered_map clonedNodes; return clone(clonedNodes); } +// ================================================================================================ bool GraphExec::isGraphExecValid(GraphExec* pGraphExec) { amd::ScopedLock lock(graphExecSetLock_); if (graphExecSet_.find(pGraphExec) == graphExecSet_.end()) { @@ -314,6 +325,7 @@ bool GraphExec::isGraphExecValid(GraphExec* pGraphExec) { return true; } +// ================================================================================================ hipError_t GraphExec::CreateStreams(uint32_t num_streams) { parallel_streams_.reserve(num_streams); for (uint32_t i = 0; i < num_streams; ++i) { @@ -353,44 +365,46 @@ hipError_t GraphExec::Init() { //! Chunk size to add to kern arg pool constexpr uint32_t kKernArgChunkSize = 128 * Ki; // ================================================================================================ -void GetKernelArgSizeForGraph(std::vector& topoOrder, - size_t& kernArgSizeForGraph) { +void GraphExec::GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph) { // GPU packet capture is enabled for kernel nodes. Calculate the kernel // arg size required for all graph kernel nodes to allocate - for (hip::GraphNode* node : topoOrder) { + for (hip::GraphNode* node : topoOrder_) { if (node->GraphCaptureEnabled()) { kernArgSizeForGraph += node->GetKerArgSize(); } else if (node->GetType() == hipGraphNodeTypeGraph) { - if (reinterpret_cast(node)->childGraph_->max_streams_ == 1) { - GetKernelArgSizeForGraph(reinterpret_cast(node)->childGraphNodeOrder_, - kernArgSizeForGraph); + auto childNode = reinterpret_cast(node); + // Child graph shares same kernel arg manager + GraphKernelArgManager* KernelArgManager = GetKernelArgManager(); + KernelArgManager->retain(); + childNode->graphExec_.SetKernelArgManager(KernelArgManager); + // Set capture stream for child graph + childNode->graphExec_.capture_stream_ = capture_stream_; + if (childNode->GetChildGraph()->max_streams_ == 1) { + childNode->graphExec_.GetKernelArgSizeForGraph(kernArgSizeForGraph); } } } } // ================================================================================================ -hipError_t AllocKernelArgForGraphNode(std::vector& topoOrder, - hip::Stream* capture_stream, hip::GraphExec* graphExec) { +hipError_t GraphExec::AllocKernelArgForGraphNode() { hipError_t status = hipSuccess; - for (auto& node : topoOrder) { + for (auto& node : topoOrder_) { if (node->GetType() == hipGraphNodeTypeKernel) { // Check if graph requires hidden heap and set as part of graphExec param. static bool initialized = false; if (!initialized && reinterpret_cast(node)->HasHiddenHeap()) { - graphExec->SetHiddenHeap(); + SetHiddenHeap(); initialized = true; } } if (node->GraphCaptureEnabled()) { - node->CaptureAndFormPacket(capture_stream, graphExec->GetKernelArgManager()); + node->CaptureAndFormPacket(capture_stream_, GetKernelArgManager()); } else if (node->GetType() == hipGraphNodeTypeGraph) { auto childNode = reinterpret_cast(node); - if (childNode->childGraph_->max_streams_ == 1) { + if (childNode->GetChildGraph()->max_streams_ == 1) { childNode->SetGraphCaptureStatus(true); - status = - AllocKernelArgForGraphNode(childNode->GetChildGraphNodeOrder(), - capture_stream, graphExec); + status = childNode->graphExec_.AllocKernelArgForGraphNode(); if (status != hipSuccess) { return status; } @@ -405,7 +419,7 @@ hipError_t GraphExec::CaptureAQLPackets() { hipError_t status = hipSuccess; if (clonedGraph_->max_streams_ == 1) { size_t kernArgSizeForGraph = 0; - GetKernelArgSizeForGraph(topoOrder_, kernArgSizeForGraph); + GetKernelArgSizeForGraph(kernArgSizeForGraph); auto device = g_devices[ihipGetDevice()]->devices()[0]; // Add a larger initial pool to accomodate for any updates to kernel args bool bStatus = kernArgManager_->AllocGraphKernargPool(kernArgSizeForGraph + kKernArgChunkSize); @@ -413,7 +427,7 @@ hipError_t GraphExec::CaptureAQLPackets() { return hipErrorMemoryAllocation; } - status = AllocKernelArgForGraphNode(topoOrder_, capture_stream_, this); + status = AllocKernelArgForGraphNode(); if (status != hipSuccess) { return status; } @@ -440,8 +454,7 @@ void GraphExec::DecrementRefCount(cl_event event, cl_int command_exec_status, vo // ================================================================================================ -hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::Stream* hip_stream, - hip::GraphExec* graphExec) { +hipError_t GraphExec::EnqueueGraphWithSingleList(hip::Stream* hip_stream) { // Accumulate command tracks all the AQL packet batch that we submit to the HW. For now // we track only kernel nodes. amd::AccumulateCommand* accumulate = nullptr; @@ -449,18 +462,18 @@ hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::St if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr); } - for (int i = 0; i < topoOrder.size(); i++) { - if (topoOrder[i]->GraphCaptureEnabled()) { - if (topoOrder[i]->GetEnabled()) { - std::vector& gpuPackets = topoOrder[i]->GetAqlPackets(); + for (int i = 0; i < topoOrder_.size(); i++) { + if (topoOrder_[i]->GraphCaptureEnabled()) { + if (topoOrder_[i]->GetEnabled()) { + std::vector& gpuPackets = topoOrder_[i]->GetAqlPackets(); for (auto& packet : gpuPackets) { - hip_stream->vdev()->dispatchAqlPacket(packet, topoOrder[i]->GetKernelName(), accumulate); + hip_stream->vdev()->dispatchAqlPacket(packet, topoOrder_[i]->GetKernelName(), accumulate); } } } else { - topoOrder[i]->SetStream(hip_stream, graphExec); - status = topoOrder[i]->CreateCommand(topoOrder[i]->GetQueue()); - topoOrder[i]->EnqueueCommands(hip_stream); + topoOrder_[i]->SetStream(hip_stream); + status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue()); + topoOrder_[i]->EnqueueCommands(hip_stream); } } @@ -530,7 +543,7 @@ bool Graph::RunOneNode(Node node, bool wait) { } if (node->GetType() == hipGraphNodeTypeGraph) { // Process child graph separately, since, there is no connection - auto child = reinterpret_cast(node)->childGraph_; + auto child = reinterpret_cast(node)->GetChildGraph(); if (!reinterpret_cast(node)->graphCaptureStatus_) { child->RunNodes(node->stream_id_, &streams_, &waitList); } @@ -694,10 +707,10 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) { initialized = true; } } - status = EnqueueGraphWithSingleList(topoOrder_, launch_stream, this); + status = EnqueueGraphWithSingleList(launch_stream); } else if (clonedGraph_->max_streams_ == 1 && instantiateDeviceId_ != launch_stream->DeviceId()) { for (int i = 0; i < topoOrder_.size(); i++) { - topoOrder_[i]->SetStream(launch_stream, this); + topoOrder_[i]->SetStream(launch_stream); status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue()); topoOrder_[i]->EnqueueCommands(launch_stream); } diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp index 7f2c7c5129..46695cd555 100644 --- a/hipamd/src/hip_graph_internal.hpp +++ b/hipamd/src/hip_graph_internal.hpp @@ -46,8 +46,6 @@ struct GraphNode; struct GraphExec; struct UserObject; typedef GraphNode* Node; -hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::Stream* hip_stream, - hip::GraphExec* graphExec = nullptr); struct UserObject : public amd::ReferenceCountedObject { typedef void (*UserCallbackDestructor)(void* data); static std::unordered_set ObjectSet_; @@ -286,7 +284,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute { } hip::Stream* GetQueue() const { return stream_; } - virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) { + virtual void SetStream(hip::Stream* stream) { stream_ = stream; } //! Updates the grpah node with the execution stream @@ -734,7 +732,6 @@ struct GraphExec : public amd::ReferenceCountedObject { hip::Stream* capture_stream_; uint currentQueueIndex_; std::unordered_map clonedNodes_; - amd::Command* lastEnqueuedCommand_; static std::unordered_set graphExecSet_; static amd::Monitor graphExecSetLock_; uint64_t flags_ = 0; @@ -750,13 +747,17 @@ struct GraphExec : public amd::ReferenceCountedObject { topoOrder_(topoOrder), clonedGraph_(clonedGraph), clonedNodes_(clonedNodes), - lastEnqueuedCommand_(nullptr), currentQueueIndex_(0), flags_(flags) { amd::ScopedLock lock(graphExecSetLock_); graphExecSet_.insert(this); } + GraphExec() : ReferenceCountedObject() { + amd::ScopedLock lock(graphExecSetLock_); + graphExecSet_.insert(this); + } + ~GraphExec() { for (auto stream : parallel_streams_) { if (stream != nullptr) { @@ -768,7 +769,9 @@ struct GraphExec : public amd::ReferenceCountedObject { graphExecSet_.erase(this); delete clonedGraph_; if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { - kernArgManager_->release(); + if (kernArgManager_ != nullptr) { + kernArgManager_->release(); + } } } @@ -816,28 +819,22 @@ struct GraphExec : public amd::ReferenceCountedObject { return kernArgManager_; } static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data); + hipError_t AllocKernelArgForGraphNode(); + void GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph); + hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream); }; struct ChildGraphNode : public GraphNode { - struct Graph* childGraph_; - std::vector childGraphNodeOrder_; - amd::Command* lastEnqueuedCommand_; - amd::Command* startCommand_; - amd::Command* endCommand_; + struct GraphExec graphExec_; bool graphCaptureStatus_; public: ChildGraphNode(Graph* g) : GraphNode(hipGraphNodeTypeGraph, "solid", "rectangle") { - childGraph_ = g->clone(); - lastEnqueuedCommand_ = nullptr; - startCommand_ = nullptr; - endCommand_ = nullptr; + graphExec_.clonedGraph_ = g->clone(); graphCaptureStatus_ = false; } - ~ChildGraphNode() { delete childGraph_; } - ChildGraphNode(const ChildGraphNode& rhs) : GraphNode(rhs) { - childGraph_ = rhs.childGraph_->clone(); + graphExec_.clonedGraph_ = rhs.graphExec_.clonedGraph_->clone(); graphCaptureStatus_ = rhs.graphCaptureStatus_; } @@ -845,43 +842,42 @@ struct ChildGraphNode : public GraphNode { return new ChildGraphNode(static_cast(*this)); } - Graph* GetChildGraph() override { return childGraph_; } + Graph* GetChildGraph() override { return graphExec_.clonedGraph_; } void SetGraphCaptureStatus(bool status) { graphCaptureStatus_ = status; } bool GetGraphCaptureStatus() { return graphCaptureStatus_; } std::vector& GetChildGraphNodeOrder() { - return childGraphNodeOrder_; + return graphExec_.topoOrder_; } - void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) override { + void SetStream(hip::Stream* stream) override { stream_ = stream; } bool TopologicalOrder(std::vector& TopoOrder) override { - return childGraph_->TopologicalOrder(TopoOrder); + return graphExec_.clonedGraph_->TopologicalOrder(TopoOrder); } - bool TopologicalOrder() { return childGraph_->TopologicalOrder(childGraphNodeOrder_); } + bool TopologicalOrder() { return graphExec_.clonedGraph_->TopologicalOrder(graphExec_.topoOrder_); } void EnqueueCommands(hip::Stream* stream) override { if (graphCaptureStatus_) { - hipError_t status = - EnqueueGraphWithSingleList(childGraphNodeOrder_, stream); - } else if (childGraph_->max_streams_ == 1) { - for (int i = 0; i < childGraphNodeOrder_.size(); i++) { - childGraphNodeOrder_[i]->SetStream(stream_); + hipError_t status = graphExec_.EnqueueGraphWithSingleList(stream); + } else if (graphExec_.clonedGraph_->max_streams_ == 1) { + for (int i = 0; i < graphExec_.topoOrder_.size(); i++) { + graphExec_.topoOrder_[i]->SetStream(stream_); hipError_t status = - childGraphNodeOrder_[i]->CreateCommand(childGraphNodeOrder_[i]->GetQueue()); - childGraphNodeOrder_[i]->EnqueueCommands(stream_); + graphExec_.topoOrder_[i]->CreateCommand(graphExec_.topoOrder_[i]->GetQueue()); + graphExec_.topoOrder_[i]->EnqueueCommands(stream_); } } } hipError_t SetParams(const Graph* childGraph) { const std::vector& newNodes = childGraph->GetNodes(); - const std::vector& oldNodes = childGraph_->GetNodes(); + const std::vector& oldNodes = graphExec_.clonedGraph_->GetNodes(); for (std::vector::size_type i = 0; i != newNodes.size(); i++) { hipError_t status = oldNodes[i]->SetParams(newNodes[i]); if (status != hipSuccess) { @@ -893,15 +889,15 @@ struct ChildGraphNode : public GraphNode { hipError_t SetParams(GraphNode* node) override { const ChildGraphNode* childGraphNode = static_cast(node); - return SetParams(childGraphNode->childGraph_); + return SetParams(childGraphNode->graphExec_.clonedGraph_); } virtual std::string GetLabel(hipGraphDebugDotFlags flag) override { - return std::to_string(GetID()) + "\n" + "graph_" + std::to_string(childGraph_->GetID()); + return std::to_string(GetID()) + "\n" + "graph_" + std::to_string(graphExec_.clonedGraph_->GetID()); } virtual void GenerateDOT(std::ostream& fout, hipGraphDebugDotFlags flag) override { - childGraph_->GenerateDOT(fout, flag); + graphExec_.clonedGraph_->GenerateDOT(fout, flag); } };