SWDEV-469422 - Derive GraphExec from Graph and ChildGraphNode from GraphExec
Change-Id: I54d67a1665355579bc249d8ff4f9806e9ee14588
This commit is contained in:
committato da
Anusha Godavarthy Surya
parent
3c863dad91
commit
13e2e797c0
+23
-31
@@ -1374,37 +1374,30 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph,
|
||||
}
|
||||
}
|
||||
}
|
||||
std::unordered_map<hip::GraphNode*, hip::GraphNode*> clonedNodes;
|
||||
hip::Graph* clonedGraph = graph->clone(clonedNodes);
|
||||
clonedGraph->memAllocNodePtrs_ = graph->memAllocNodePtrs_;
|
||||
if (clonedGraph == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
std::vector<hip::GraphNode*> graphNodes;
|
||||
clonedGraph->ScheduleNodes();
|
||||
if (false == clonedGraph->TopologicalOrder(graphNodes)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
*pGraphExec = new hip::GraphExec(graphNodes, clonedGraph, clonedNodes, flags);
|
||||
if (*pGraphExec != nullptr) {
|
||||
graph->SetGraphInstantiated(true);
|
||||
if (DEBUG_HIP_GRAPH_DOT_PRINT) {
|
||||
static int i = 1;
|
||||
std::string filename =
|
||||
"graph_" + std::to_string(amd::Os::getProcessId()) + "_dot_print_" + std::to_string(i++);
|
||||
hipError_t status =
|
||||
ihipGraphDebugDotPrint(reinterpret_cast<hipGraph_t>(clonedGraph), filename.c_str(), 0);
|
||||
if (status == hipSuccess) {
|
||||
LogPrintfInfo("[hipGraph] graph dump:%s", filename.c_str());
|
||||
}
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
(*pGraphExec)->SetKernelArgManager(new hip::GraphKernelArgManager());
|
||||
}
|
||||
return (*pGraphExec)->Init();
|
||||
} else {
|
||||
*pGraphExec = new hip::GraphExec(flags);
|
||||
if (*pGraphExec == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
graph->clone(*pGraphExec, true);
|
||||
(*pGraphExec)->ScheduleNodes();
|
||||
if (false == (*pGraphExec)->TopologicalOrder()) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
graph->SetGraphInstantiated(true);
|
||||
if (DEBUG_HIP_GRAPH_DOT_PRINT) {
|
||||
static int i = 1;
|
||||
std::string filename =
|
||||
"graph_" + std::to_string(amd::Os::getProcessId()) + "_dot_print_" + std::to_string(i++);
|
||||
hipError_t status =
|
||||
ihipGraphDebugDotPrint(reinterpret_cast<hipGraph_t>(*pGraphExec), filename.c_str(), 0);
|
||||
if (status == hipSuccess) {
|
||||
LogPrintfInfo("[hipGraph] graph dump:%s", filename.c_str());
|
||||
}
|
||||
}
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
(*pGraphExec)->SetKernelArgManager(new hip::GraphKernelArgManager());
|
||||
}
|
||||
return (*pGraphExec)->Init();
|
||||
}
|
||||
|
||||
hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
|
||||
@@ -1865,8 +1858,7 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra
|
||||
for (std::vector<hip::GraphNode*>::size_type i = 0; i != childGraphNodes.size(); i++) {
|
||||
if (childGraphNodes[i]->GraphCaptureEnabled()) {
|
||||
status = reinterpret_cast<hip::ChildGraphNode*>(clonedNode)
|
||||
->graphExec_.UpdateAQLPacket(
|
||||
reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
|
||||
->UpdateAQLPacket(reinterpret_cast<hip::GraphKernelNode*>(childGraphNodes[i]));
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -193,7 +193,7 @@ void Graph::ScheduleOneNode(Node node, int stream_id) {
|
||||
child->ScheduleNodes();
|
||||
max_streams_ = std::max(max_streams_, child->max_streams_);
|
||||
if (child->max_streams_ == 1) {
|
||||
reinterpret_cast<hip::ChildGraphNode*>(node)->TopologicalOrder();
|
||||
reinterpret_cast<hip::ChildGraphNode*>(node)->GraphExec::TopologicalOrder();
|
||||
}
|
||||
}
|
||||
for (auto edge: node->GetEdges()) {
|
||||
@@ -269,13 +269,13 @@ bool Graph::TopologicalOrder(std::vector<Node>& TopoOrder) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
Graph* Graph::clone(std::unordered_map<Node, Node>& clonedNodes) const {
|
||||
Graph* newGraph = new Graph(device_, this);
|
||||
for (auto entry : vertices_) {
|
||||
void Graph::clone(Graph* newGraph, bool cloneNodes) const {
|
||||
newGraph->pOriginalGraph_ = this;
|
||||
for (hip::GraphNode* entry : vertices_) {
|
||||
GraphNode* node = entry->clone();
|
||||
node->SetParentGraph(newGraph);
|
||||
newGraph->vertices_.push_back(node);
|
||||
clonedNodes[entry] = node;
|
||||
newGraph->clonedNodes_[entry] = node;
|
||||
}
|
||||
|
||||
std::vector<Node> clonedEdges;
|
||||
@@ -284,17 +284,17 @@ Graph* Graph::clone(std::unordered_map<Node, Node>& clonedNodes) const {
|
||||
const std::vector<Node>& edges = node->GetEdges();
|
||||
clonedEdges.clear();
|
||||
for (auto edge : edges) {
|
||||
clonedEdges.push_back(clonedNodes[edge]);
|
||||
clonedEdges.push_back(newGraph->clonedNodes_[edge]);
|
||||
}
|
||||
clonedNodes[node]->SetEdges(clonedEdges);
|
||||
newGraph->clonedNodes_[node]->SetEdges(clonedEdges);
|
||||
}
|
||||
for (auto node : vertices_) {
|
||||
const std::vector<Node>& dependencies = node->GetDependencies();
|
||||
clonedDependencies.clear();
|
||||
for (auto dep : dependencies) {
|
||||
clonedDependencies.push_back(clonedNodes[dep]);
|
||||
clonedDependencies.push_back(newGraph->clonedNodes_[dep]);
|
||||
}
|
||||
clonedNodes[node]->SetDependencies(clonedDependencies);
|
||||
newGraph->clonedNodes_[node]->SetDependencies(clonedDependencies);
|
||||
}
|
||||
for (auto& userObj : graphUserObj_) {
|
||||
userObj.first->retain();
|
||||
@@ -307,13 +307,17 @@ Graph* Graph::clone(std::unordered_map<Node, Node>& clonedNodes) const {
|
||||
if (roots_.size() > 0) {
|
||||
memcpy(&newGraph->roots_[0], &roots_[0], sizeof(Node) * roots_.size());
|
||||
}
|
||||
return newGraph;
|
||||
newGraph->memAllocNodePtrs_ = memAllocNodePtrs_;
|
||||
if(!cloneNodes) {
|
||||
newGraph->clonedNodes_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
Graph* Graph::clone() const {
|
||||
std::unordered_map<Node, Node> clonedNodes;
|
||||
return clone(clonedNodes);
|
||||
Graph* newGraph = new Graph(device_);
|
||||
clone(newGraph);
|
||||
return newGraph;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
@@ -350,7 +354,7 @@ hipError_t GraphExec::CreateStreams(uint32_t num_streams) {
|
||||
hipError_t GraphExec::Init() {
|
||||
hipError_t status = hipSuccess;
|
||||
// create extra stream to avoid queue collision with the default execution stream
|
||||
status = CreateStreams(clonedGraph_->max_streams_);
|
||||
status = CreateStreams(max_streams_);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
@@ -376,11 +380,11 @@ void GraphExec::GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph) {
|
||||
// Child graph shares same kernel arg manager
|
||||
GraphKernelArgManager* KernelArgManager = GetKernelArgManager();
|
||||
KernelArgManager->retain();
|
||||
childNode->graphExec_.SetKernelArgManager(KernelArgManager);
|
||||
childNode->SetKernelArgManager(KernelArgManager);
|
||||
// Set capture stream for child graph
|
||||
childNode->graphExec_.capture_stream_ = capture_stream_;
|
||||
childNode->capture_stream_ = capture_stream_;
|
||||
if (childNode->GetChildGraph()->max_streams_ == 1) {
|
||||
childNode->graphExec_.GetKernelArgSizeForGraph(kernArgSizeForGraph);
|
||||
childNode->GetKernelArgSizeForGraph(kernArgSizeForGraph);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -404,7 +408,7 @@ hipError_t GraphExec::AllocKernelArgForGraphNode() {
|
||||
auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
|
||||
if (childNode->GetChildGraph()->max_streams_ == 1) {
|
||||
childNode->SetGraphCaptureStatus(true);
|
||||
status = childNode->graphExec_.AllocKernelArgForGraphNode();
|
||||
status = childNode->AllocKernelArgForGraphNode();
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
@@ -417,7 +421,7 @@ hipError_t GraphExec::AllocKernelArgForGraphNode() {
|
||||
// ================================================================================================
|
||||
hipError_t GraphExec::CaptureAQLPackets() {
|
||||
hipError_t status = hipSuccess;
|
||||
if (clonedGraph_->max_streams_ == 1) {
|
||||
if (max_streams_ == 1) {
|
||||
size_t kernArgSizeForGraph = 0;
|
||||
GetKernelArgSizeForGraph(kernArgSizeForGraph);
|
||||
auto device = g_devices[ihipGetDevice()]->devices()[0];
|
||||
@@ -439,7 +443,7 @@ hipError_t GraphExec::CaptureAQLPackets() {
|
||||
// ================================================================================================
|
||||
hipError_t GraphExec::UpdateAQLPacket(hip::GraphNode* node) {
|
||||
hipError_t status = hipSuccess;
|
||||
if (clonedGraph_->max_streams_ == 1) {
|
||||
if (max_streams_ == 1) {
|
||||
node->CaptureAndFormPacket(capture_stream_, kernArgManager_);
|
||||
}
|
||||
return hipSuccess;
|
||||
@@ -696,7 +700,7 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
|
||||
repeatLaunch_ = true;
|
||||
}
|
||||
|
||||
if (clonedGraph_->max_streams_ == 1 && instantiateDeviceId_ == launch_stream->DeviceId()) {
|
||||
if (max_streams_ == 1 && instantiateDeviceId_ == launch_stream->DeviceId()) {
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
// If the graph has kernels that does device side allocation, during packet capture, heap is
|
||||
// allocated because heap pointer has to be added to the AQL packet, and initialized during
|
||||
@@ -708,7 +712,7 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
|
||||
}
|
||||
}
|
||||
status = EnqueueGraphWithSingleList(launch_stream);
|
||||
} else if (clonedGraph_->max_streams_ == 1 && instantiateDeviceId_ != launch_stream->DeviceId()) {
|
||||
} else if (max_streams_ == 1 && instantiateDeviceId_ != launch_stream->DeviceId()) {
|
||||
for (int i = 0; i < topoOrder_.size(); i++) {
|
||||
topoOrder_[i]->SetStream(launch_stream);
|
||||
status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
|
||||
@@ -716,9 +720,9 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
|
||||
}
|
||||
} else {
|
||||
// Update streams for the graph execution
|
||||
clonedGraph_->UpdateStreams(launch_stream, parallel_streams_);
|
||||
UpdateStreams(launch_stream, parallel_streams_);
|
||||
// Execute all nodes in the graph
|
||||
if (!clonedGraph_->RunNodes()) {
|
||||
if (!RunNodes()) {
|
||||
LogError("Failed to launch nodes!");
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
@@ -744,7 +748,6 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
|
||||
block_command->enqueue();
|
||||
block_command->release();
|
||||
CallbackCommand->release();
|
||||
ResetQueueIndex();
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
@@ -491,6 +491,7 @@ struct Graph {
|
||||
std::unordered_set<GraphNode*> capturedNodes_;
|
||||
bool graphInstantiated_;
|
||||
std::unordered_set<void*> memAllocNodePtrs_;
|
||||
std::unordered_map<Node, Node> clonedNodes_;
|
||||
public:
|
||||
Graph(hip::Device* device, const Graph* original = nullptr)
|
||||
: pOriginalGraph_(original)
|
||||
@@ -636,7 +637,7 @@ struct Graph {
|
||||
|
||||
bool TopologicalOrder(std::vector<Node>& TopoOrder);
|
||||
|
||||
Graph* clone(std::unordered_map<Node, Node>& clonedNodes) const;
|
||||
void clone(Graph* newGraph, bool cloneNodes = false) const;
|
||||
Graph* clone() const;
|
||||
void GenerateDOT(std::ostream& fout, hipGraphDebugDotFlags flag) {
|
||||
fout << "subgraph cluster_" << GetID() << " {" << std::endl;
|
||||
@@ -724,14 +725,11 @@ struct Graph {
|
||||
};
|
||||
struct GraphKernelNode;
|
||||
|
||||
struct GraphExec : public amd::ReferenceCountedObject {
|
||||
struct GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
//! Topological order of the graph doesn't include nodes embedded as part of the child graph
|
||||
std::vector<Node> topoOrder_;
|
||||
struct Graph* clonedGraph_;
|
||||
std::vector<hip::Stream*> parallel_streams_;
|
||||
hip::Stream* capture_stream_;
|
||||
uint currentQueueIndex_;
|
||||
std::unordered_map<Node, Node> clonedNodes_;
|
||||
static std::unordered_set<GraphExec*> graphExecSet_;
|
||||
static amd::Monitor graphExecSetLock_;
|
||||
uint64_t flags_ = 0;
|
||||
@@ -741,23 +739,14 @@ struct GraphExec : public amd::ReferenceCountedObject {
|
||||
bool repeatLaunch_ = false;
|
||||
|
||||
public:
|
||||
GraphExec(std::vector<Node>& topoOrder, struct Graph*& clonedGraph,
|
||||
std::unordered_map<Node, Node>& clonedNodes, uint64_t flags = 0)
|
||||
GraphExec(uint64_t flags = 0)
|
||||
: ReferenceCountedObject(),
|
||||
topoOrder_(topoOrder),
|
||||
clonedGraph_(clonedGraph),
|
||||
clonedNodes_(clonedNodes),
|
||||
currentQueueIndex_(0),
|
||||
Graph(hip::getCurrentDevice()),
|
||||
flags_(flags) {
|
||||
amd::ScopedLock lock(graphExecSetLock_);
|
||||
graphExecSet_.insert(this);
|
||||
}
|
||||
|
||||
GraphExec() : ReferenceCountedObject() {
|
||||
amd::ScopedLock lock(graphExecSetLock_);
|
||||
graphExecSet_.insert(this);
|
||||
}
|
||||
|
||||
~GraphExec() {
|
||||
for (auto stream : parallel_streams_) {
|
||||
if (stream != nullptr) {
|
||||
@@ -767,7 +756,6 @@ struct GraphExec : public amd::ReferenceCountedObject {
|
||||
}
|
||||
amd::ScopedLock lock(graphExecSetLock_);
|
||||
graphExecSet_.erase(this);
|
||||
delete clonedGraph_;
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
if (kernArgManager_ != nullptr) {
|
||||
kernArgManager_->release();
|
||||
@@ -793,15 +781,6 @@ struct GraphExec : public amd::ReferenceCountedObject {
|
||||
//! Check executable graphs validity
|
||||
static bool isGraphExecValid(GraphExec* pGraphExec);
|
||||
std::vector<Node>& GetNodes() { return topoOrder_; }
|
||||
|
||||
hip::Stream* GetAvailableStreams() {
|
||||
if (currentQueueIndex_ < parallel_streams_.size()) {
|
||||
return parallel_streams_[currentQueueIndex_++];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void ResetQueueIndex() { currentQueueIndex_ = 0; }
|
||||
uint64_t GetFlags() const { return flags_; }
|
||||
hipError_t Init();
|
||||
hipError_t CreateStreams(uint32_t num_streams);
|
||||
@@ -822,19 +801,19 @@ struct GraphExec : public amd::ReferenceCountedObject {
|
||||
hipError_t AllocKernelArgForGraphNode();
|
||||
void GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph);
|
||||
hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream);
|
||||
bool TopologicalOrder() { return Graph::TopologicalOrder(topoOrder_); }
|
||||
};
|
||||
|
||||
struct ChildGraphNode : public GraphNode {
|
||||
struct GraphExec graphExec_;
|
||||
struct ChildGraphNode : public GraphNode, public GraphExec {
|
||||
bool graphCaptureStatus_;
|
||||
public:
|
||||
ChildGraphNode(Graph* g) : GraphNode(hipGraphNodeTypeGraph, "solid", "rectangle") {
|
||||
graphExec_.clonedGraph_ = g->clone();
|
||||
ChildGraphNode(Graph* g) : GraphNode(hipGraphNodeTypeGraph, "solid", "rectangle"), GraphExec() {
|
||||
g->clone(this);
|
||||
graphCaptureStatus_ = false;
|
||||
}
|
||||
|
||||
ChildGraphNode(const ChildGraphNode& rhs) : GraphNode(rhs) {
|
||||
graphExec_.clonedGraph_ = rhs.graphExec_.clonedGraph_->clone();
|
||||
ChildGraphNode(const ChildGraphNode& rhs) : GraphNode(rhs), GraphExec() {
|
||||
rhs.Graph::clone(this);
|
||||
graphCaptureStatus_ = rhs.graphCaptureStatus_;
|
||||
}
|
||||
|
||||
@@ -842,14 +821,14 @@ struct ChildGraphNode : public GraphNode {
|
||||
return new ChildGraphNode(static_cast<ChildGraphNode const&>(*this));
|
||||
}
|
||||
|
||||
Graph* GetChildGraph() override { return graphExec_.clonedGraph_; }
|
||||
Graph* GetChildGraph() override { return this; }
|
||||
|
||||
void SetGraphCaptureStatus(bool status) { graphCaptureStatus_ = status; }
|
||||
|
||||
bool GetGraphCaptureStatus() { return graphCaptureStatus_; }
|
||||
|
||||
std::vector<Node>& GetChildGraphNodeOrder() {
|
||||
return graphExec_.topoOrder_;
|
||||
return topoOrder_;
|
||||
}
|
||||
|
||||
void SetStream(hip::Stream* stream) override {
|
||||
@@ -857,27 +836,25 @@ struct ChildGraphNode : public GraphNode {
|
||||
}
|
||||
|
||||
bool TopologicalOrder(std::vector<Node>& TopoOrder) override {
|
||||
return graphExec_.clonedGraph_->TopologicalOrder(TopoOrder);
|
||||
return Graph::TopologicalOrder(TopoOrder);
|
||||
}
|
||||
|
||||
bool TopologicalOrder() { return graphExec_.clonedGraph_->TopologicalOrder(graphExec_.topoOrder_); }
|
||||
|
||||
void EnqueueCommands(hip::Stream* stream) override {
|
||||
if (graphCaptureStatus_) {
|
||||
hipError_t status = graphExec_.EnqueueGraphWithSingleList(stream);
|
||||
} else if (graphExec_.clonedGraph_->max_streams_ == 1) {
|
||||
for (int i = 0; i < graphExec_.topoOrder_.size(); i++) {
|
||||
graphExec_.topoOrder_[i]->SetStream(stream_);
|
||||
hipError_t status = EnqueueGraphWithSingleList(stream);
|
||||
} else if (max_streams_ == 1) {
|
||||
for (int i = 0; i < topoOrder_.size(); i++) {
|
||||
topoOrder_[i]->SetStream(stream_);
|
||||
hipError_t status =
|
||||
graphExec_.topoOrder_[i]->CreateCommand(graphExec_.topoOrder_[i]->GetQueue());
|
||||
graphExec_.topoOrder_[i]->EnqueueCommands(stream_);
|
||||
topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
|
||||
topoOrder_[i]->EnqueueCommands(stream_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t SetParams(const Graph* childGraph) {
|
||||
const std::vector<Node>& newNodes = childGraph->GetNodes();
|
||||
const std::vector<Node>& oldNodes = graphExec_.clonedGraph_->GetNodes();
|
||||
const std::vector<Node>& oldNodes = Graph::GetNodes();
|
||||
for (std::vector<Node>::size_type i = 0; i != newNodes.size(); i++) {
|
||||
hipError_t status = oldNodes[i]->SetParams(newNodes[i]);
|
||||
if (status != hipSuccess) {
|
||||
@@ -889,15 +866,15 @@ struct ChildGraphNode : public GraphNode {
|
||||
|
||||
hipError_t SetParams(GraphNode* node) override {
|
||||
const ChildGraphNode* childGraphNode = static_cast<ChildGraphNode const*>(node);
|
||||
return SetParams(childGraphNode->graphExec_.clonedGraph_);
|
||||
return SetParams((Graph*)this);
|
||||
}
|
||||
|
||||
virtual std::string GetLabel(hipGraphDebugDotFlags flag) override {
|
||||
return std::to_string(GetID()) + "\n" + "graph_" + std::to_string(graphExec_.clonedGraph_->GetID());
|
||||
return std::to_string(GraphNode::GetID()) + "\n" + "graph_" + std::to_string(Graph::GetID());
|
||||
}
|
||||
|
||||
virtual void GenerateDOT(std::ostream& fout, hipGraphDebugDotFlags flag) override {
|
||||
graphExec_.clonedGraph_->GenerateDOT(fout, flag);
|
||||
Graph::GenerateDOT(fout, flag);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user