Files
rocm-systems/hipamd/src/hip_graph_internal.cpp
T
Saleel Kudchadker e1e5d071ba SWDEV-301667 - Port optimization to save extra packet to graphs
Change-Id: Ibaf64a4efe070c42620e6e153c1862a4a0b15664
2023-08-23 16:58:21 -04:00

640 wiersze
24 KiB
C++

/* Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "hip_graph_internal.hpp"
#include <queue>
#define CASE_STRING(X, C) \
case X: \
case_string = #C; \
break;
const char* GetGraphNodeTypeString(uint32_t op) {
const char* case_string;
switch (static_cast<hipGraphNodeType>(op)) {
CASE_STRING(hipGraphNodeTypeKernel, KernelNode)
CASE_STRING(hipGraphNodeTypeMemcpy, MemcpyNode)
CASE_STRING(hipGraphNodeTypeMemset, MemsetNode)
CASE_STRING(hipGraphNodeTypeHost, HostNode)
CASE_STRING(hipGraphNodeTypeGraph, GraphNode)
CASE_STRING(hipGraphNodeTypeEmpty, EmptyNode)
CASE_STRING(hipGraphNodeTypeWaitEvent, WaitEventNode)
CASE_STRING(hipGraphNodeTypeEventRecord, EventRecordNode)
CASE_STRING(hipGraphNodeTypeExtSemaphoreSignal, ExtSemaphoreSignalNode)
CASE_STRING(hipGraphNodeTypeExtSemaphoreWait, ExtSemaphoreWaitNode)
CASE_STRING(hipGraphNodeTypeMemAlloc, MemAllocNode)
CASE_STRING(hipGraphNodeTypeMemFree, MemFreeNode)
CASE_STRING(hipGraphNodeTypeMemcpyFromSymbol, MemcpyFromSymbolNode)
CASE_STRING(hipGraphNodeTypeMemcpyToSymbol, MemcpyToSymbolNode)
default:
case_string = "Unknown node type";
};
return case_string;
};
namespace hip {
int GraphNode::nextID = 0;
int Graph::nextID = 0;
std::unordered_set<GraphNode*> GraphNode::nodeSet_;
amd::Monitor GraphNode::nodeSetLock_{"Guards global node set"};
std::unordered_set<Graph*> Graph::graphSet_;
amd::Monitor Graph::graphSetLock_{"Guards global graph set"};
std::unordered_set<GraphExec*> GraphExec::graphExecSet_;
amd::Monitor GraphExec::graphExecSetLock_{"Guards global exec graph set"};
std::unordered_set<UserObject*> UserObject::ObjectSet_;
amd::Monitor UserObject::UserObjectLock_{"Guards global user object"};
hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t count,
hipMemcpyKind kind) {
hipError_t status = ihipMemcpy_validate(dst, src, count, kind);
if (status != hipSuccess) {
return status;
}
size_t sOffsetOrig = 0;
amd::Memory* origSrcMemory = getMemoryObject(src, sOffsetOrig);
size_t dOffsetOrig = 0;
amd::Memory* origDstMemory = getMemoryObject(dst, dOffsetOrig);
size_t sOffset = 0;
amd::Memory* srcMemory = getMemoryObject(src, sOffset);
size_t dOffset = 0;
amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
if ((srcMemory == nullptr) && (dstMemory != nullptr)) { // host to device
if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
return hipErrorInvalidValue;
}
if ((kind != hipMemcpyHostToDevice) && (kind != hipMemcpyDefault)) {
return hipErrorInvalidValue;
}
} else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { // device to host
if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
return hipErrorInvalidValue;
}
if ((kind != hipMemcpyDeviceToHost) && (kind != hipMemcpyDefault)) {
return hipErrorInvalidValue;
}
} else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
return hipErrorInvalidValue;
}
if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
return hipErrorInvalidValue;
}
}
return hipSuccess;
}
hipError_t GraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParams) {
hipError_t status = ihipMemcpy3D_validate(pNodeParams);
if (status != hipSuccess) {
return status;
}
size_t offset = 0;
const HIP_MEMCPY3D pCopy = hip::getDrvMemcpy3DDesc(*pNodeParams);
// If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the
// (unified virtual address space) base address of the source data and the bytes per row to apply.
// {src/dst}Array is ignored.
hipMemoryType srcMemoryType = pCopy.srcMemoryType;
if (srcMemoryType == hipMemoryTypeUnified) {
amd::Memory* memObj = getMemoryObject(pCopy.srcDevice, offset);
srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) &
memObj->getMemFlags()) ? hipMemoryTypeHost : hipMemoryTypeDevice;
if (srcMemoryType == hipMemoryTypeHost) {
// {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system
// memory.
const_cast<HIP_MEMCPY3D*>(&pCopy)->srcHost = pCopy.srcDevice;
const_cast<HIP_MEMCPY3D*>(&pCopy)->srcXInBytes += offset;
}
}
offset = 0;
hipMemoryType dstMemoryType = pCopy.dstMemoryType;
if (dstMemoryType == hipMemoryTypeUnified) {
amd::Memory* memObj = getMemoryObject(pCopy.dstDevice, offset);
dstMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) &
memObj->getMemFlags()) ? hipMemoryTypeHost : hipMemoryTypeDevice;
if (dstMemoryType == hipMemoryTypeHost) {
const_cast<HIP_MEMCPY3D*>(&pCopy)->dstHost = pCopy.dstDevice;
const_cast<HIP_MEMCPY3D*>(&pCopy)->dstXInBytes += offset;
}
}
offset = 0;
// If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned.
// In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning.
if (srcMemoryType == hipMemoryTypeHost) {
srcMemoryType = getMemoryObject(pCopy.srcHost, offset) ? hipMemoryTypeDevice :
hipMemoryTypeHost;
if (srcMemoryType == hipMemoryTypeDevice) {
const_cast<HIP_MEMCPY3D*>(&pCopy)->srcDevice = const_cast<void*>(pCopy.srcHost);
}
}
offset = 0;
if (dstMemoryType == hipMemoryTypeHost) {
dstMemoryType = getMemoryObject(pCopy.dstHost, offset) ? hipMemoryTypeDevice :
hipMemoryTypeHost;
if (dstMemoryType == hipMemoryTypeDevice) {
const_cast<HIP_MEMCPY3D*>(&pCopy)->dstDevice = const_cast<void*>(pCopy.dstDevice);
}
}
amd::Coord3D srcOrigin = {pCopy.srcXInBytes, pCopy.srcY, pCopy.srcZ};
amd::Coord3D dstOrigin = {pCopy.dstXInBytes, pCopy.dstY, pCopy.dstZ};
amd::Coord3D copyRegion = {pCopy.WidthInBytes, pCopy.Height, pCopy.Depth};
if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeDevice)) {
// Host to Device.
amd::Memory* dstMemory;
amd::BufferRect srcRect;
amd::BufferRect dstRect;
status =
ihipMemcpyHtoDValidate(pCopy.srcHost, pCopy.dstDevice, srcOrigin, dstOrigin, copyRegion,
pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, pCopy.dstPitch,
pCopy.dstPitch * pCopy.dstHeight, dstMemory, srcRect, dstRect);
if (status != hipSuccess) {
return status;
}
} else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeHost)) {
// Device to Host.
amd::Memory* srcMemory;
amd::BufferRect srcRect;
amd::BufferRect dstRect;
status =
ihipMemcpyDtoHValidate(pCopy.srcDevice, pCopy.dstHost, srcOrigin, dstOrigin, copyRegion,
pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, pCopy.dstPitch,
pCopy.dstPitch * pCopy.dstHeight, srcMemory, srcRect, dstRect);
if (status != hipSuccess) {
return status;
}
} else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice)) {
// Device to Device.
amd::Memory* srcMemory;
amd::Memory* dstMemory;
amd::BufferRect srcRect;
amd::BufferRect dstRect;
status = ihipMemcpyDtoDValidate(pCopy.srcDevice, pCopy.dstDevice, srcOrigin, dstOrigin,
copyRegion, pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight,
pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight, srcMemory,
dstMemory, srcRect, dstRect);
if (status != hipSuccess) {
return status;
}
} else if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeArray)) {
amd::Image* dstImage;
amd::BufferRect srcRect;
status =
ihipMemcpyHtoAValidate(pCopy.srcHost, pCopy.dstArray, srcOrigin, dstOrigin, copyRegion,
pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, dstImage, srcRect);
if (status != hipSuccess) {
return status;
}
} else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeHost)) {
// Image to Host.
amd::Image* srcImage;
amd::BufferRect dstRect;
status =
ihipMemcpyAtoHValidate(pCopy.srcArray, pCopy.dstHost, srcOrigin, dstOrigin, copyRegion,
pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight, srcImage, dstRect);
if (status != hipSuccess) {
return status;
}
} else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeArray)) {
// Device to Image.
amd::Image* dstImage;
amd::Memory* srcMemory;
amd::BufferRect dstRect;
amd::BufferRect srcRect;
status = ihipMemcpyDtoAValidate(pCopy.srcDevice, pCopy.dstArray, srcOrigin, dstOrigin,
copyRegion, pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight,
dstImage, srcMemory, dstRect, srcRect);
if (status != hipSuccess) {
return status;
}
} else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeDevice)) {
// Image to Device.
amd::BufferRect srcRect;
amd::BufferRect dstRect;
amd::Memory* dstMemory;
amd::Image* srcImage;
status = ihipMemcpyAtoDValidate(pCopy.srcArray, pCopy.dstDevice, srcOrigin, dstOrigin,
copyRegion, pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight,
dstMemory, srcImage, srcRect, dstRect);
if (status != hipSuccess) {
return status;
}
} else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeArray)) {
amd::Image* srcImage;
amd::Image* dstImage;
status = ihipMemcpyAtoAValidate(pCopy.srcArray, pCopy.dstArray, srcOrigin, dstOrigin,
copyRegion, srcImage, dstImage);
if (status != hipSuccess) {
return status;
}
} else {
return hipErrorInvalidValue;
}
return hipSuccess;
}
bool Graph::isGraphValid(Graph* pGraph) {
amd::ScopedLock lock(graphSetLock_);
if (graphSet_.find(pGraph) == graphSet_.end()) {
return false;
}
return true;
}
void Graph::AddNode(const Node& node) {
vertices_.emplace_back(node);
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Add %s(%p)\n",
GetGraphNodeTypeString(node->GetType()), node);
node->SetParentGraph(this);
}
void Graph::RemoveNode(const Node& node) {
vertices_.erase(std::remove(vertices_.begin(), vertices_.end(), node), vertices_.end());
delete node;
}
// root nodes are all vertices with 0 in-degrees
std::vector<Node> Graph::GetRootNodes() const {
std::vector<Node> roots;
for (auto entry : vertices_) {
if (entry->GetInDegree() == 0) {
roots.push_back(entry);
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Root node: %s(%p)\n",
GetGraphNodeTypeString(entry->GetType()), entry);
}
}
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "\n");
return roots;
}
// leaf nodes are all vertices with 0 out-degrees
std::vector<Node> Graph::GetLeafNodes() const {
std::vector<Node> leafNodes;
for (auto entry : vertices_) {
if (entry->GetOutDegree() == 0) {
leafNodes.push_back(entry);
}
}
return leafNodes;
}
size_t Graph::GetLeafNodeCount() const {
int numLeafNodes = 0;
for (auto entry : vertices_) {
if (entry->GetOutDegree() == 0) {
numLeafNodes++;
}
}
return numLeafNodes;
}
std::vector<std::pair<Node, Node>> Graph::GetEdges() const {
std::vector<std::pair<Node, Node>> edges;
for (const auto& i : vertices_) {
for (const auto& j : i->GetEdges()) {
edges.push_back(std::make_pair(i, j));
}
}
return edges;
}
void Graph::GetRunListUtil(Node v, std::unordered_map<Node, bool>& visited,
std::vector<Node>& singleList,
std::vector<std::vector<Node>>& parallelLists,
std::unordered_map<Node, std::vector<Node>>& dependencies) {
// Mark the current node as visited.
visited[v] = true;
singleList.push_back(v);
// Recurse for all the vertices adjacent to this vertex
for (auto& adjNode : v->GetEdges()) {
if (!visited[adjNode]) {
// For the parallel list nodes add parent as the dependency
if (singleList.empty()) {
ClPrint(amd::LOG_INFO, amd::LOG_CODE,
"[hipGraph] For %s(%p)- add parent as dependency %s(%p)\n",
GetGraphNodeTypeString(adjNode->GetType()), adjNode,
GetGraphNodeTypeString(v->GetType()), v);
dependencies[adjNode].push_back(v);
}
GetRunListUtil(adjNode, visited, singleList, parallelLists, dependencies);
} else {
for (auto& list : parallelLists) {
// Merge singleList when adjNode matches with the first element of the list in existing
// lists
if (adjNode == list[0]) {
for (auto k = singleList.rbegin(); k != singleList.rend(); ++k) {
list.insert(list.begin(), *k);
}
singleList.erase(singleList.begin(), singleList.end());
}
}
// If the list cannot be merged with the existing list add as dependancy
if (!singleList.empty()) {
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] For %s(%p)- add dependency %s(%p)\n",
GetGraphNodeTypeString(adjNode->GetType()), adjNode,
GetGraphNodeTypeString(v->GetType()), v);
dependencies[adjNode].push_back(v);
}
}
}
if (!singleList.empty()) {
parallelLists.push_back(singleList);
singleList.erase(singleList.begin(), singleList.end());
}
}
// The function to do Topological Sort.
// It uses recursive GetRunListUtil()
void Graph::GetRunList(std::vector<std::vector<Node>>& parallelLists,
std::unordered_map<Node, std::vector<Node>>& dependencies) {
std::vector<Node> singleList;
// Mark all the vertices as not visited
std::unordered_map<Node, bool> visited;
for (auto node : vertices_) visited[node] = false;
// Call the recursive helper function for all vertices one by one
for (auto node : vertices_) {
// If the node has embedded child graph
node->GetRunList(parallelLists, dependencies);
if (visited[node] == false) {
GetRunListUtil(node, visited, singleList, parallelLists, dependencies);
}
}
for (size_t i = 0; i < parallelLists.size(); i++) {
for (size_t j = 0; j < parallelLists[i].size(); j++) {
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] List %d - %s(%p)\n", i + 1,
GetGraphNodeTypeString(parallelLists[i][j]->GetType()), parallelLists[i][j]);
}
}
}
bool Graph::TopologicalOrder(std::vector<Node>& TopoOrder) {
std::queue<Node> q;
std::unordered_map<Node, int> inDegree;
for (auto entry : vertices_) {
if (entry->GetInDegree() == 0) {
q.push(entry);
}
inDegree[entry] = entry->GetInDegree();
}
while (!q.empty())
{
Node node = q.front();
TopoOrder.push_back(node);
q.pop();
for (auto edge : node->GetEdges()) {
inDegree[edge]--;
if (inDegree[edge] == 0) {
q.push(edge);
}
}
}
if (GetNodeCount() == TopoOrder.size()) {
return true;
}
return false;
}
Graph* Graph::clone(std::unordered_map<Node, Node>& clonedNodes) const {
Graph* newGraph = new Graph(device_, this);
for (auto entry : vertices_) {
GraphNode* node = entry->clone();
node->SetParentGraph(newGraph);
newGraph->vertices_.push_back(node);
clonedNodes[entry] = node;
}
std::vector<Node> clonedEdges;
std::vector<Node> clonedDependencies;
for (auto node : vertices_) {
const std::vector<Node>& edges = node->GetEdges();
clonedEdges.clear();
for (auto edge : edges) {
clonedEdges.push_back(clonedNodes[edge]);
}
clonedNodes[node]->SetEdges(clonedEdges);
}
for (auto node : vertices_) {
const std::vector<Node>& dependencies = node->GetDependencies();
clonedDependencies.clear();
for (auto dep : dependencies) {
clonedDependencies.push_back(clonedNodes[dep]);
}
clonedNodes[node]->SetDependencies(clonedDependencies);
}
return newGraph;
}
Graph* Graph::clone() const {
std::unordered_map<Node, Node> clonedNodes;
return clone(clonedNodes);
}
bool GraphExec::isGraphExecValid(GraphExec* pGraphExec) {
amd::ScopedLock lock(graphExecSetLock_);
if (graphExecSet_.find(pGraphExec) == graphExecSet_.end()) {
return false;
}
return true;
}
hipError_t GraphExec::CreateStreams(uint32_t num_streams) {
parallel_streams_.reserve(num_streams);
for (uint32_t i = 0; i < num_streams; ++i) {
auto stream = new hip::Stream(hip::getCurrentDevice(),
hip::Stream::Priority::Normal, hipStreamNonBlocking);
if (stream == nullptr || !stream->Create()) {
if (stream != nullptr) {
hip::Stream::Destroy(stream);
}
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create parallel stream!\n");
return hipErrorOutOfMemory;
}
parallel_streams_.push_back(stream);
}
return hipSuccess;
}
hipError_t GraphExec::Init() {
hipError_t status;
size_t min_num_streams = 1;
for (auto& node : topoOrder_) {
status = node->GetNumParallelStreams(min_num_streams);
if(status != hipSuccess) {
return status;
}
}
status = CreateStreams(parallelLists_.size() - 1 + min_num_streams);
return status;
}
hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
std::vector<Node>& topoOrder, Graph* clonedGraph,
amd::Command*& graphStart, amd::Command*& graphEnd, hip::Stream* stream) {
hipError_t status;
for (auto& node : topoOrder) {
// TODO: clone commands from next launch
status = node->CreateCommand(node->GetQueue());
if (status != hipSuccess) return status;
amd::Command::EventWaitList waitList;
for (auto depNode : nodeWaitLists[node]) {
for (auto command : depNode->GetCommands()) {
waitList.push_back(command);
}
}
node->UpdateEventWaitLists(waitList);
}
std::vector<Node> rootNodes = clonedGraph->GetRootNodes();
ClPrint(amd::LOG_INFO, amd::LOG_CODE,
"[hipGraph] RootCommand get launched on stream (stream:%p)\n", stream);
for (auto& root : rootNodes) {
//If rootnode is launched on to the same stream dont add dependency
if (root->GetQueue() != stream) {
if (graphStart == nullptr) {
graphStart = new amd::Marker(*stream, false, {});
if (graphStart == nullptr) {
return hipErrorOutOfMemory;
}
}
amd::Command::EventWaitList waitList;
waitList.push_back(graphStart);
auto commands = root->GetCommands();
if (!commands.empty()) {
commands[0]->updateEventWaitList(waitList);
}
}
}
// graphEnd ensures next enqueued ones start after graph is finished (all parallel branches)
amd::Command::EventWaitList graphLastCmdWaitList;
std::vector<Node> leafNodes = clonedGraph->GetLeafNodes();
for (auto& leaf : leafNodes) {
// If leaf node is launched on to the same stream dont add dependency
if (leaf->GetQueue() != stream) {
amd::Command::EventWaitList waitList;
waitList.push_back(graphEnd);
auto commands = leaf->GetCommands();
if (!commands.empty()) {
graphLastCmdWaitList.push_back(commands.back());
}
}
}
if (!graphLastCmdWaitList.empty()) {
graphEnd = new amd::Marker(*stream, false, graphLastCmdWaitList);
ClPrint(amd::LOG_INFO, amd::LOG_CODE,
"[hipGraph] EndCommand will get launched on stream (stream:%p)\n", stream);
if (graphEnd == nullptr) {
graphStart->release();
return hipErrorOutOfMemory;
}
}
return hipSuccess;
}
void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* stream,
GraphExec* ptr) {
int i = 0;
for (const auto& list : parallelLists) {
// first parallel list will be launched on the same queue as parent
if (i == 0) {
for (auto& node : list) {
node->SetStream(stream, ptr);
}
} else { // New stream for parallel branches
hip::Stream* stream = ptr->GetAvailableStreams();
for (auto& node : list) {
node->SetStream(stream, ptr);
}
}
i++;
}
}
hipError_t GraphExec::Run(hipStream_t stream) {
hipError_t status;
if (hip::getStream(stream) == nullptr) {
return hipErrorInvalidResourceHandle;
}
auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream()
: reinterpret_cast<hip::Stream*>(stream);
if (flags_ & hipGraphInstantiateFlagAutoFreeOnLaunch) {
if (!topoOrder_.empty()) {
topoOrder_[0]->GetParentGraph()->FreeAllMemory(hip_stream);
}
}
// If this is a repeat launch, make sure corresponding MemFreeNode exists for a MemAlloc node
if (repeatLaunch_ == true) {
for (auto& node : topoOrder_) {
if (node->GetType() == hipGraphNodeTypeMemAlloc &&
static_cast<GraphMemAllocNode*>(node)->IsActiveMem() == true) {
return hipErrorInvalidValue;
}
}
} else {
repeatLaunch_ = true;
}
UpdateStream(parallelLists_, hip_stream, this);
amd::Command* rootCommand = nullptr;
amd::Command* endCommand = nullptr;
status = FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, clonedGraph_, rootCommand,
endCommand, hip_stream);
if (status != hipSuccess) {
return status;
}
if (rootCommand != nullptr) {
rootCommand->enqueue();
rootCommand->release();
}
for (int i = 0; i < topoOrder_.size(); i++) {
if (DEBUG_CLR_GRAPH_ENABLE_BUFFERING) {
// Enable buffering for graph with single branch
if (parallelLists_.size() == 1) {
// Peep through the next node. If current and next node are kernel then enable AQL
// buffering
if (((i + 1) != topoOrder_.size()) && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel &&
topoOrder_[i + 1]->GetType() == hipGraphNodeTypeKernel) {
topoOrder_[i]->EnableBuffering();
}
}
}
topoOrder_[i]->EnqueueCommands(stream);
}
if (endCommand != nullptr) {
endCommand->enqueue();
endCommand->release();
}
ResetQueueIndex();
return status;
}
} // namespace hip