SWDEV-469422 - Refacor childgraph node

Remove static functions in graph

Change-Id: I4df94915f81f250acaea60398aea32ef0ed658e2
Этот коммит содержится в:
Anusha GodavarthySurya
2024-12-03 13:29:50 +00:00
коммит произвёл Anusha Godavarthy Surya
родитель c8afd7109d
Коммит 28cbf2bc4f
3 изменённых файлов: 78 добавлений и 68 удалений
+3 -2
Просмотреть файл
@@ -1864,8 +1864,9 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra
reinterpret_cast<hip::ChildGraphNode*>(clonedNode)->TopologicalOrder(childGraphNodes);
for (std::vector<hip::GraphNode*>::size_type i = 0; i != childGraphNodes.size(); i++) {
if (childGraphNodes[i]->GraphCaptureEnabled()) {
status = reinterpret_cast<hip::GraphExec*>(hGraphExec)
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
status = reinterpret_cast<hip::ChildGraphNode*>(clonedNode)
->graphExec_.UpdateAQLPacket(
reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
if (status != hipSuccess) {
return status;
}
+45 -32
Просмотреть файл
@@ -62,7 +62,8 @@ std::unordered_set<Graph*> Graph::graphSet_;
amd::Monitor Graph::graphSetLock_{};
std::unordered_set<GraphExec*> GraphExec::graphExecSet_;
// Guards global exec graph set
amd::Monitor GraphExec::graphExecSetLock_{};
// we have graphExec object as part of child graph and we need recursive lock
amd::Monitor GraphExec::graphExecSetLock_(true);
std::unordered_set<UserObject*> UserObject::ObjectSet_;
// Guards global user object
amd::Monitor UserObject::UserObjectLock_{};
@@ -93,6 +94,7 @@ hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t
return hipSuccess;
}
// ================================================================================================
hipError_t GraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParams) {
hipError_t status;
status = ihipMemcpy3D_validate(pNodeParams);
@@ -108,6 +110,7 @@ hipError_t GraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParams)
return hipSuccess;
}
// ================================================================================================
bool Graph::isGraphValid(Graph* pGraph) {
amd::ScopedLock lock(graphSetLock_);
if (graphSet_.find(pGraph) == graphSet_.end()) {
@@ -116,6 +119,7 @@ bool Graph::isGraphValid(Graph* pGraph) {
return true;
}
// ================================================================================================
void Graph::AddNode(const Node& node) {
vertices_.emplace_back(node);
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Add %s(%p)",
@@ -123,11 +127,13 @@ void Graph::AddNode(const Node& node) {
node->SetParentGraph(this);
}
// ================================================================================================
void Graph::RemoveNode(const Node& node) {
vertices_.erase(std::remove(vertices_.begin(), vertices_.end(), node), vertices_.end());
delete node;
}
// ================================================================================================
// root nodes are all vertices with 0 in-degrees
std::vector<Node> Graph::GetRootNodes() const {
std::vector<Node> roots;
@@ -141,6 +147,7 @@ std::vector<Node> Graph::GetRootNodes() const {
return roots;
}
// ================================================================================================
// leaf nodes are all vertices with 0 out-degrees
std::vector<Node> Graph::GetLeafNodes() const {
std::vector<Node> leafNodes;
@@ -152,6 +159,7 @@ std::vector<Node> Graph::GetLeafNodes() const {
return leafNodes;
}
// ================================================================================================
size_t Graph::GetLeafNodeCount() const {
int numLeafNodes = 0;
for (auto entry : vertices_) {
@@ -181,7 +189,7 @@ void Graph::ScheduleOneNode(Node node, int stream_id) {
// Process child graph separately, since, there is no connection
if (node->GetType() == hipGraphNodeTypeGraph) {
auto child = reinterpret_cast<hip::ChildGraphNode*>(node)->childGraph_;
auto child = reinterpret_cast<hip::ChildGraphNode*>(node)->GetChildGraph();
child->ScheduleNodes();
max_streams_ = std::max(max_streams_, child->max_streams_);
if (child->max_streams_ == 1) {
@@ -260,6 +268,7 @@ bool Graph::TopologicalOrder(std::vector<Node>& TopoOrder) {
return false;
}
// ================================================================================================
Graph* Graph::clone(std::unordered_map<Node, Node>& clonedNodes) const {
Graph* newGraph = new Graph(device_, this);
for (auto entry : vertices_) {
@@ -301,11 +310,13 @@ Graph* Graph::clone(std::unordered_map<Node, Node>& clonedNodes) const {
return newGraph;
}
// ================================================================================================
Graph* Graph::clone() const {
std::unordered_map<Node, Node> clonedNodes;
return clone(clonedNodes);
}
// ================================================================================================
bool GraphExec::isGraphExecValid(GraphExec* pGraphExec) {
amd::ScopedLock lock(graphExecSetLock_);
if (graphExecSet_.find(pGraphExec) == graphExecSet_.end()) {
@@ -314,6 +325,7 @@ bool GraphExec::isGraphExecValid(GraphExec* pGraphExec) {
return true;
}
// ================================================================================================
hipError_t GraphExec::CreateStreams(uint32_t num_streams) {
parallel_streams_.reserve(num_streams);
for (uint32_t i = 0; i < num_streams; ++i) {
@@ -353,44 +365,46 @@ hipError_t GraphExec::Init() {
//! Chunk size to add to kern arg pool
constexpr uint32_t kKernArgChunkSize = 128 * Ki;
// ================================================================================================
void GetKernelArgSizeForGraph(std::vector<hip::Node>& topoOrder,
size_t& kernArgSizeForGraph) {
void GraphExec::GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph) {
// GPU packet capture is enabled for kernel nodes. Calculate the kernel
// arg size required for all graph kernel nodes to allocate
for (hip::GraphNode* node : topoOrder) {
for (hip::GraphNode* node : topoOrder_) {
if (node->GraphCaptureEnabled()) {
kernArgSizeForGraph += node->GetKerArgSize();
} else if (node->GetType() == hipGraphNodeTypeGraph) {
if (reinterpret_cast<hip::ChildGraphNode*>(node)->childGraph_->max_streams_ == 1) {
GetKernelArgSizeForGraph(reinterpret_cast<hip::ChildGraphNode*>(node)->childGraphNodeOrder_,
kernArgSizeForGraph);
auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
// Child graph shares same kernel arg manager
GraphKernelArgManager* KernelArgManager = GetKernelArgManager();
KernelArgManager->retain();
childNode->graphExec_.SetKernelArgManager(KernelArgManager);
// Set capture stream for child graph
childNode->graphExec_.capture_stream_ = capture_stream_;
if (childNode->GetChildGraph()->max_streams_ == 1) {
childNode->graphExec_.GetKernelArgSizeForGraph(kernArgSizeForGraph);
}
}
}
}
// ================================================================================================
hipError_t AllocKernelArgForGraphNode(std::vector<hip::Node>& topoOrder,
hip::Stream* capture_stream, hip::GraphExec* graphExec) {
hipError_t GraphExec::AllocKernelArgForGraphNode() {
hipError_t status = hipSuccess;
for (auto& node : topoOrder) {
for (auto& node : topoOrder_) {
if (node->GetType() == hipGraphNodeTypeKernel) {
// Check if graph requires hidden heap and set as part of graphExec param.
static bool initialized = false;
if (!initialized && reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
graphExec->SetHiddenHeap();
SetHiddenHeap();
initialized = true;
}
}
if (node->GraphCaptureEnabled()) {
node->CaptureAndFormPacket(capture_stream, graphExec->GetKernelArgManager());
node->CaptureAndFormPacket(capture_stream_, GetKernelArgManager());
} else if (node->GetType() == hipGraphNodeTypeGraph) {
auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
if (childNode->childGraph_->max_streams_ == 1) {
if (childNode->GetChildGraph()->max_streams_ == 1) {
childNode->SetGraphCaptureStatus(true);
status =
AllocKernelArgForGraphNode(childNode->GetChildGraphNodeOrder(),
capture_stream, graphExec);
status = childNode->graphExec_.AllocKernelArgForGraphNode();
if (status != hipSuccess) {
return status;
}
@@ -405,7 +419,7 @@ hipError_t GraphExec::CaptureAQLPackets() {
hipError_t status = hipSuccess;
if (clonedGraph_->max_streams_ == 1) {
size_t kernArgSizeForGraph = 0;
GetKernelArgSizeForGraph(topoOrder_, kernArgSizeForGraph);
GetKernelArgSizeForGraph(kernArgSizeForGraph);
auto device = g_devices[ihipGetDevice()]->devices()[0];
// Add a larger initial pool to accomodate for any updates to kernel args
bool bStatus = kernArgManager_->AllocGraphKernargPool(kernArgSizeForGraph + kKernArgChunkSize);
@@ -413,7 +427,7 @@ hipError_t GraphExec::CaptureAQLPackets() {
return hipErrorMemoryAllocation;
}
status = AllocKernelArgForGraphNode(topoOrder_, capture_stream_, this);
status = AllocKernelArgForGraphNode();
if (status != hipSuccess) {
return status;
}
@@ -440,8 +454,7 @@ void GraphExec::DecrementRefCount(cl_event event, cl_int command_exec_status, vo
// ================================================================================================
hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::Stream* hip_stream,
hip::GraphExec* graphExec) {
hipError_t GraphExec::EnqueueGraphWithSingleList(hip::Stream* hip_stream) {
// Accumulate command tracks all the AQL packet batch that we submit to the HW. For now
// we track only kernel nodes.
amd::AccumulateCommand* accumulate = nullptr;
@@ -449,18 +462,18 @@ hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::St
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
}
for (int i = 0; i < topoOrder.size(); i++) {
if (topoOrder[i]->GraphCaptureEnabled()) {
if (topoOrder[i]->GetEnabled()) {
std::vector<uint8_t*>& gpuPackets = topoOrder[i]->GetAqlPackets();
for (int i = 0; i < topoOrder_.size(); i++) {
if (topoOrder_[i]->GraphCaptureEnabled()) {
if (topoOrder_[i]->GetEnabled()) {
std::vector<uint8_t*>& gpuPackets = topoOrder_[i]->GetAqlPackets();
for (auto& packet : gpuPackets) {
hip_stream->vdev()->dispatchAqlPacket(packet, topoOrder[i]->GetKernelName(), accumulate);
hip_stream->vdev()->dispatchAqlPacket(packet, topoOrder_[i]->GetKernelName(), accumulate);
}
}
} else {
topoOrder[i]->SetStream(hip_stream, graphExec);
status = topoOrder[i]->CreateCommand(topoOrder[i]->GetQueue());
topoOrder[i]->EnqueueCommands(hip_stream);
topoOrder_[i]->SetStream(hip_stream);
status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
topoOrder_[i]->EnqueueCommands(hip_stream);
}
}
@@ -530,7 +543,7 @@ bool Graph::RunOneNode(Node node, bool wait) {
}
if (node->GetType() == hipGraphNodeTypeGraph) {
// Process child graph separately, since, there is no connection
auto child = reinterpret_cast<hip::ChildGraphNode*>(node)->childGraph_;
auto child = reinterpret_cast<hip::ChildGraphNode*>(node)->GetChildGraph();
if (!reinterpret_cast<hip::ChildGraphNode*>(node)->graphCaptureStatus_) {
child->RunNodes(node->stream_id_, &streams_, &waitList);
}
@@ -694,10 +707,10 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
initialized = true;
}
}
status = EnqueueGraphWithSingleList(topoOrder_, launch_stream, this);
status = EnqueueGraphWithSingleList(launch_stream);
} else if (clonedGraph_->max_streams_ == 1 && instantiateDeviceId_ != launch_stream->DeviceId()) {
for (int i = 0; i < topoOrder_.size(); i++) {
topoOrder_[i]->SetStream(launch_stream, this);
topoOrder_[i]->SetStream(launch_stream);
status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
topoOrder_[i]->EnqueueCommands(launch_stream);
}
+30 -34
Просмотреть файл
@@ -46,8 +46,6 @@ struct GraphNode;
struct GraphExec;
struct UserObject;
typedef GraphNode* Node;
hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::Stream* hip_stream,
hip::GraphExec* graphExec = nullptr);
struct UserObject : public amd::ReferenceCountedObject {
typedef void (*UserCallbackDestructor)(void* data);
static std::unordered_set<UserObject*> ObjectSet_;
@@ -286,7 +284,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
}
hip::Stream* GetQueue() const { return stream_; }
virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) {
virtual void SetStream(hip::Stream* stream) {
stream_ = stream;
}
//! Updates the grpah node with the execution stream
@@ -734,7 +732,6 @@ struct GraphExec : public amd::ReferenceCountedObject {
hip::Stream* capture_stream_;
uint currentQueueIndex_;
std::unordered_map<Node, Node> clonedNodes_;
amd::Command* lastEnqueuedCommand_;
static std::unordered_set<GraphExec*> graphExecSet_;
static amd::Monitor graphExecSetLock_;
uint64_t flags_ = 0;
@@ -750,13 +747,17 @@ struct GraphExec : public amd::ReferenceCountedObject {
topoOrder_(topoOrder),
clonedGraph_(clonedGraph),
clonedNodes_(clonedNodes),
lastEnqueuedCommand_(nullptr),
currentQueueIndex_(0),
flags_(flags) {
amd::ScopedLock lock(graphExecSetLock_);
graphExecSet_.insert(this);
}
GraphExec() : ReferenceCountedObject() {
amd::ScopedLock lock(graphExecSetLock_);
graphExecSet_.insert(this);
}
~GraphExec() {
for (auto stream : parallel_streams_) {
if (stream != nullptr) {
@@ -768,7 +769,9 @@ struct GraphExec : public amd::ReferenceCountedObject {
graphExecSet_.erase(this);
delete clonedGraph_;
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
kernArgManager_->release();
if (kernArgManager_ != nullptr) {
kernArgManager_->release();
}
}
}
@@ -816,28 +819,22 @@ struct GraphExec : public amd::ReferenceCountedObject {
return kernArgManager_;
}
static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
hipError_t AllocKernelArgForGraphNode();
void GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph);
hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream);
};
struct ChildGraphNode : public GraphNode {
struct Graph* childGraph_;
std::vector<Node> childGraphNodeOrder_;
amd::Command* lastEnqueuedCommand_;
amd::Command* startCommand_;
amd::Command* endCommand_;
struct GraphExec graphExec_;
bool graphCaptureStatus_;
public:
ChildGraphNode(Graph* g) : GraphNode(hipGraphNodeTypeGraph, "solid", "rectangle") {
childGraph_ = g->clone();
lastEnqueuedCommand_ = nullptr;
startCommand_ = nullptr;
endCommand_ = nullptr;
graphExec_.clonedGraph_ = g->clone();
graphCaptureStatus_ = false;
}
~ChildGraphNode() { delete childGraph_; }
ChildGraphNode(const ChildGraphNode& rhs) : GraphNode(rhs) {
childGraph_ = rhs.childGraph_->clone();
graphExec_.clonedGraph_ = rhs.graphExec_.clonedGraph_->clone();
graphCaptureStatus_ = rhs.graphCaptureStatus_;
}
@@ -845,43 +842,42 @@ struct ChildGraphNode : public GraphNode {
return new ChildGraphNode(static_cast<ChildGraphNode const&>(*this));
}
Graph* GetChildGraph() override { return childGraph_; }
Graph* GetChildGraph() override { return graphExec_.clonedGraph_; }
void SetGraphCaptureStatus(bool status) { graphCaptureStatus_ = status; }
bool GetGraphCaptureStatus() { return graphCaptureStatus_; }
std::vector<Node>& GetChildGraphNodeOrder() {
return childGraphNodeOrder_;
return graphExec_.topoOrder_;
}
void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) override {
void SetStream(hip::Stream* stream) override {
stream_ = stream;
}
bool TopologicalOrder(std::vector<Node>& TopoOrder) override {
return childGraph_->TopologicalOrder(TopoOrder);
return graphExec_.clonedGraph_->TopologicalOrder(TopoOrder);
}
bool TopologicalOrder() { return childGraph_->TopologicalOrder(childGraphNodeOrder_); }
bool TopologicalOrder() { return graphExec_.clonedGraph_->TopologicalOrder(graphExec_.topoOrder_); }
void EnqueueCommands(hip::Stream* stream) override {
if (graphCaptureStatus_) {
hipError_t status =
EnqueueGraphWithSingleList(childGraphNodeOrder_, stream);
} else if (childGraph_->max_streams_ == 1) {
for (int i = 0; i < childGraphNodeOrder_.size(); i++) {
childGraphNodeOrder_[i]->SetStream(stream_);
hipError_t status = graphExec_.EnqueueGraphWithSingleList(stream);
} else if (graphExec_.clonedGraph_->max_streams_ == 1) {
for (int i = 0; i < graphExec_.topoOrder_.size(); i++) {
graphExec_.topoOrder_[i]->SetStream(stream_);
hipError_t status =
childGraphNodeOrder_[i]->CreateCommand(childGraphNodeOrder_[i]->GetQueue());
childGraphNodeOrder_[i]->EnqueueCommands(stream_);
graphExec_.topoOrder_[i]->CreateCommand(graphExec_.topoOrder_[i]->GetQueue());
graphExec_.topoOrder_[i]->EnqueueCommands(stream_);
}
}
}
hipError_t SetParams(const Graph* childGraph) {
const std::vector<Node>& newNodes = childGraph->GetNodes();
const std::vector<Node>& oldNodes = childGraph_->GetNodes();
const std::vector<Node>& oldNodes = graphExec_.clonedGraph_->GetNodes();
for (std::vector<Node>::size_type i = 0; i != newNodes.size(); i++) {
hipError_t status = oldNodes[i]->SetParams(newNodes[i]);
if (status != hipSuccess) {
@@ -893,15 +889,15 @@ struct ChildGraphNode : public GraphNode {
hipError_t SetParams(GraphNode* node) override {
const ChildGraphNode* childGraphNode = static_cast<ChildGraphNode const*>(node);
return SetParams(childGraphNode->childGraph_);
return SetParams(childGraphNode->graphExec_.clonedGraph_);
}
virtual std::string GetLabel(hipGraphDebugDotFlags flag) override {
return std::to_string(GetID()) + "\n" + "graph_" + std::to_string(childGraph_->GetID());
return std::to_string(GetID()) + "\n" + "graph_" + std::to_string(graphExec_.clonedGraph_->GetID());
}
virtual void GenerateDOT(std::ostream& fout, hipGraphDebugDotFlags flag) override {
childGraph_->GenerateDOT(fout, flag);
graphExec_.clonedGraph_->GenerateDOT(fout, flag);
}
};