/* Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#include "hip_graph_internal.hpp"
#include <queue>

// Expands to one `case` of the switch in GetGraphNodeTypeString(): maps the
// enumerator X to the stringified token C.
#define CASE_STRING(X, C)                                                                          \
  case X:                                                                                          \
    case_string = #C;                                                                              \
    break;

// Returns a printable name for a graph node type; used by the log statements
// in this file. Values not listed below yield "Unknown node type".
const char* GetGraphNodeTypeString(uint32_t op) {
  const char* case_string;
  switch (static_cast<hipGraphNodeType>(op)) {
    CASE_STRING(hipGraphNodeTypeKernel, KernelNode)
    CASE_STRING(hipGraphNodeTypeMemcpy, MemcpyNode)
    CASE_STRING(hipGraphNodeTypeMemset, MemsetNode)
    CASE_STRING(hipGraphNodeTypeHost, HostNode)
    CASE_STRING(hipGraphNodeTypeGraph, GraphNode)
    CASE_STRING(hipGraphNodeTypeEmpty, EmptyNode)
    CASE_STRING(hipGraphNodeTypeWaitEvent, WaitEventNode)
    CASE_STRING(hipGraphNodeTypeEventRecord, EventRecordNode)
    CASE_STRING(hipGraphNodeTypeExtSemaphoreSignal, ExtSemaphoreSignalNode)
    CASE_STRING(hipGraphNodeTypeExtSemaphoreWait, ExtSemaphoreWaitNode)
    CASE_STRING(hipGraphNodeTypeMemAlloc, MemAllocNode)
    CASE_STRING(hipGraphNodeTypeMemFree, MemFreeNode)
    CASE_STRING(hipGraphNodeTypeMemcpyFromSymbol, MemcpyFromSymbolNode)
    CASE_STRING(hipGraphNodeTypeMemcpyToSymbol, MemcpyToSymbolNode)
    default:
      case_string = "Unknown node type";
  }
  return case_string;
}

namespace hip {

// Definitions of the static members declared in hip_graph_internal.hpp.
int GraphNode::nextID = 0;
int Graph::nextID = 0;
std::unordered_set<GraphNode*> GraphNode::nodeSet_;
amd::Monitor GraphNode::nodeSetLock_{"Guards global node set"};
std::unordered_set<Graph*> Graph::graphSet_;
amd::Monitor Graph::graphSetLock_{"Guards global graph set"};
std::unordered_set<GraphExec*> GraphExec::graphExecSet_;
amd::Monitor GraphExec::graphExecSetLock_{"Guards global exec graph set"};
std::unordered_set<UserObject*> UserObject::ObjectSet_;
amd::Monitor UserObject::UserObjectLock_{"Guards global user object"};

// Validates the arguments of a 1D memcpy node.
//
// Runs ihipMemcpy_validate() first, then checks that `kind` is consistent with
// which of src/dst resolve to a tracked memory object:
//   - only dst resolves: kind must be HostToDevice or Default
//   - only src resolves: kind must be DeviceToHost or Default
// Returns hipSuccess, the status from ihipMemcpy_validate(), or
// hipErrorInvalidValue.
hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t count,
                                             hipMemcpyKind kind) {
  hipError_t status = ihipMemcpy_validate(dst, src, count, kind);
  if (status != hipSuccess) {
    return status;
  }

  // NOTE(review): the "orig" lookups and the lookups below resolve the same
  // src/dst pointers, so the device comparisons further down compare a lookup
  // against itself. Confirm whether the node's previously stored pointers were
  // intended for the "orig" pair.
  size_t sOffsetOrig = 0;
  amd::Memory* origSrcMemory = getMemoryObject(src, sOffsetOrig);
  size_t dOffsetOrig = 0;
  amd::Memory* origDstMemory = getMemoryObject(dst, dOffsetOrig);

  size_t sOffset = 0;
  amd::Memory* srcMemory = getMemoryObject(src, sOffset);
  size_t dOffset = 0;
  amd::Memory* dstMemory = getMemoryObject(dst, dOffset);

  if ((srcMemory == nullptr) && (dstMemory != nullptr)) {
    // host to device
    if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
      return hipErrorInvalidValue;
    }
    if ((kind != hipMemcpyHostToDevice) && (kind != hipMemcpyDefault)) {
      return hipErrorInvalidValue;
    }
  } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) {
    // device to host
    if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
      return hipErrorInvalidValue;
    }
    if ((kind != hipMemcpyDeviceToHost) && (kind != hipMemcpyDefault)) {
      return hipErrorInvalidValue;
    }
  } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
    // both sides resolve to tracked memory objects
    if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
      return hipErrorInvalidValue;
    }
    if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
      return hipErrorInvalidValue;
    }
  }
  return hipSuccess;
}

// Validates 3D memcpy node parameters, first in their runtime-API form and
// then after conversion to the driver-API descriptor.
hipError_t GraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParams) {
  hipError_t status = ihipMemcpy3D_validate(pNodeParams);
  if (status != hipSuccess) {
    return status;
  }
  const HIP_MEMCPY3D pCopy = hip::getDrvMemcpy3DDesc(*pNodeParams);
  status = ihipDrvMemcpy3D_validate(&pCopy);
  if (status != hipSuccess) {
    return status;
  }
  return hipSuccess;
}

// Returns true when pGraph is present in the global graph set. The lookup is
// done while holding graphSetLock_.
bool Graph::isGraphValid(Graph* pGraph) {
  amd::ScopedLock lock(graphSetLock_);
  return graphSet_.find(pGraph) != graphSet_.end();
}

// Appends node to this graph's vertex list and records this graph as the
// node's parent.
void Graph::AddNode(const Node& node) {
  vertices_.emplace_back(node);
  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Add %s(%p)\n",
          GetGraphNodeTypeString(node->GetType()), node);
  node->SetParentGraph(this);
}

// Removes every occurrence of node from the vertex list and deletes the node.
void Graph::RemoveNode(const Node& node) {
  // Copy the pointer first: `node` may be a reference to an element of
  // vertices_, in which case std::remove would shift a different value into
  // the referenced slot while it is still being used as the comparison value,
  // and the delete below would then free the wrong node.
  const Node target = node;
  vertices_.erase(std::remove(vertices_.begin(), vertices_.end(), target), vertices_.end());
  delete target;
}

// root nodes are all vertices with 0 in-degrees
std::vector<Node> Graph::GetRootNodes() const {
  std::vector<Node> roots;
  for (auto entry : vertices_) {
    if (entry->GetInDegree() == 0) {
      roots.push_back(entry);
      ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Root node: %s(%p)\n",
              GetGraphNodeTypeString(entry->GetType()), entry);
    }
  }
  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "\n");
  return roots;
}

// leaf nodes are all vertices with 0 out-degrees
std::vector<Node> Graph::GetLeafNodes() const {
  std::vector<Node> leafNodes;
  for (auto entry : vertices_) {
    if (entry->GetOutDegree() == 0) {
      leafNodes.push_back(entry);
    }
  }
  return leafNodes;
}

// Number of vertices with 0 out-degrees.
size_t Graph::GetLeafNodeCount() const {
  size_t numLeafNodes = 0;
  for (auto entry : vertices_) {
    if (entry->GetOutDegree() == 0) {
      ++numLeafNodes;
    }
  }
  return numLeafNodes;
}

// Returns every edge of the graph as a (from, to) pair, in vertex order.
std::vector<std::pair<Node, Node>> Graph::GetEdges() const {
  std::vector<std::pair<Node, Node>> edges;
  for (const auto& from : vertices_) {
    for (const auto& to : from->GetEdges()) {
      edges.push_back(std::make_pair(from, to));
    }
  }
  return edges;
}

// Depth-first helper for GetRunList().
//
// v             node to visit
// visited       per-node visited flags, updated in place
// singleList    the sequential chain currently being built
// parallelLists completed chains; a chain is appended when the walk from v
//               ends, or prepended to an existing chain whose first node is
//               one of v's already-visited successors
// dependencies  for each node, the nodes from other chains it must wait on
void Graph::GetRunListUtil(Node v, std::unordered_map<Node, bool>& visited,
                           std::vector<Node>& singleList,
                           std::vector<std::vector<Node>>& parallelLists,
                           std::unordered_map<Node, std::vector<Node>>& dependencies) {
  // Mark the current node as visited.
  visited[v] = true;
  singleList.push_back(v);

  // Recurse for all the vertices adjacent to this vertex
  for (auto& adjNode : v->GetEdges()) {
    if (!visited[adjNode]) {
      // For the parallel list nodes add parent as the dependency.
      // singleList is empty here only after an earlier successor's chain has
      // already been flushed or merged, i.e. adjNode starts a new chain.
      if (singleList.empty()) {
        ClPrint(amd::LOG_INFO, amd::LOG_CODE,
                "[hipGraph] For %s(%p)- add parent as dependency %s(%p)\n",
                GetGraphNodeTypeString(adjNode->GetType()), adjNode,
                GetGraphNodeTypeString(v->GetType()), v);
        dependencies[adjNode].push_back(v);
      }
      GetRunListUtil(adjNode, visited, singleList, parallelLists, dependencies);
    } else {
      for (auto& list : parallelLists) {
        // Merge singleList when adjNode matches with the first element of the list in existing
        // lists
        if (adjNode == list[0]) {
          // Prepend the whole chain at once, keeping its order.
          list.insert(list.begin(), singleList.begin(), singleList.end());
          singleList.clear();
        }
      }
      // If the list cannot be merged with the existing list add as dependency
      if (!singleList.empty()) {
        ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] For %s(%p)- add dependency %s(%p)\n",
                GetGraphNodeTypeString(adjNode->GetType()), adjNode,
                GetGraphNodeTypeString(v->GetType()), v);
        dependencies[adjNode].push_back(v);
      }
    }
  }

  // Flush whatever chain is left as its own parallel list.
  if (!singleList.empty()) {
    parallelLists.push_back(singleList);
    singleList.clear();
  }
}

// The function to do Topological Sort.
// It uses recursive GetRunListUtil() void Graph::GetRunList(std::vector>& parallelLists, std::unordered_map>& dependencies) { std::vector singleList; // Mark all the vertices as not visited std::unordered_map visited; for (auto node : vertices_) visited[node] = false; // Call the recursive helper function for all vertices one by one for (auto node : vertices_) { // If the node has embedded child graph node->GetRunList(parallelLists, dependencies); if (visited[node] == false) { GetRunListUtil(node, visited, singleList, parallelLists, dependencies); } } for (size_t i = 0; i < parallelLists.size(); i++) { for (size_t j = 0; j < parallelLists[i].size(); j++) { ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] List %d - %s(%p)\n", i + 1, GetGraphNodeTypeString(parallelLists[i][j]->GetType()), parallelLists[i][j]); } } } bool Graph::TopologicalOrder(std::vector& TopoOrder) { std::queue q; std::unordered_map inDegree; for (auto entry : vertices_) { if (entry->GetInDegree() == 0) { q.push(entry); } inDegree[entry] = entry->GetInDegree(); } while (!q.empty()) { Node node = q.front(); TopoOrder.push_back(node); q.pop(); for (auto edge : node->GetEdges()) { inDegree[edge]--; if (inDegree[edge] == 0) { q.push(edge); } } } if (GetNodeCount() == TopoOrder.size()) { return true; } return false; } Graph* Graph::clone(std::unordered_map& clonedNodes) const { Graph* newGraph = new Graph(device_, this); for (auto entry : vertices_) { GraphNode* node = entry->clone(); node->SetParentGraph(newGraph); newGraph->vertices_.push_back(node); clonedNodes[entry] = node; } std::vector clonedEdges; std::vector clonedDependencies; for (auto node : vertices_) { const std::vector& edges = node->GetEdges(); clonedEdges.clear(); for (auto edge : edges) { clonedEdges.push_back(clonedNodes[edge]); } clonedNodes[node]->SetEdges(clonedEdges); } for (auto node : vertices_) { const std::vector& dependencies = node->GetDependencies(); clonedDependencies.clear(); for (auto dep : dependencies) { 
clonedDependencies.push_back(clonedNodes[dep]); } clonedNodes[node]->SetDependencies(clonedDependencies); } return newGraph; } Graph* Graph::clone() const { std::unordered_map clonedNodes; return clone(clonedNodes); } bool GraphExec::isGraphExecValid(GraphExec* pGraphExec) { amd::ScopedLock lock(graphExecSetLock_); if (graphExecSet_.find(pGraphExec) == graphExecSet_.end()) { return false; } return true; } hipError_t GraphExec::CreateStreams(uint32_t num_streams) { parallel_streams_.reserve(num_streams); for (uint32_t i = 0; i < num_streams; ++i) { auto stream = new hip::Stream(hip::getCurrentDevice(), hip::Stream::Priority::Normal, hipStreamNonBlocking); if (stream == nullptr || !stream->Create()) { if (stream != nullptr) { hip::Stream::Destroy(stream); } ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create parallel stream!\n"); return hipErrorOutOfMemory; } parallel_streams_.push_back(stream); } return hipSuccess; } hipError_t GraphExec::Init() { hipError_t status = hipSuccess; size_t min_num_streams = 1; for (auto& node : topoOrder_) { status = node->GetNumParallelStreams(min_num_streams); if(status != hipSuccess) { return status; } } status = CreateStreams(parallelLists_.size() - 1 + min_num_streams); return status; } hipError_t GraphExec::CaptureAQLPackets() { hipError_t status = hipSuccess; if (parallelLists_.size() == 1) { size_t kernArgSizeForGraph = 0; hip::Stream* stream = nullptr; // GPU packet capture is enabled for kernel nodes. 
Calculate the kernel // arg size required for all graph kernel nodes to allocate for (const auto& list : parallelLists_) { stream = GetAvailableStreams(); for (auto& node : list) { node->SetStream(stream, this); if (node->GetType() == hipGraphNodeTypeKernel) { kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize(); } } } auto device = g_devices[ihipGetDevice()]->devices()[0]; if (device->info().largeBar_) { // Pad kernel argument buffer with sentinal size bytes to do a readback later kernArgSizeForGraph += sizeof(int); kernarg_pool_graph_ = reinterpret_cast
(device->deviceLocalAlloc(kernArgSizeForGraph)); device_kernarg_pool_ = true; } else { kernarg_pool_graph_ = reinterpret_cast
( device->hostAlloc(kernArgSizeForGraph, 0, amd::Device::MemorySegment::kKernArg)); } if (kernarg_pool_graph_ == nullptr) { return hipErrorMemoryAllocation; } kernarg_pool_size_graph_ = kernArgSizeForGraph; for (auto& node : topoOrder_) { if (node->GetType() == hipGraphNodeTypeKernel) { auto kernelNode = reinterpret_cast(node); status = node->CreateCommand(node->GetQueue()); // From the kernel pool allocate the kern arg size required for the current kernel node. address kernArgOffset = allocKernArg(kernelNode->GetKernargSegmentByteSize(), kernelNode->GetKernargSegmentAlignment()); if (kernArgOffset == nullptr) { return hipErrorMemoryAllocation; } // Form GPU packet capture for the kernel node. kernelNode->CaptureAndFormPacket(kernArgOffset); } } if (device_kernarg_pool_) { // Write HDP_MEM_COHERENCY_FLUSH_CNTL reg to initiate flush read to HDP mem. Verify mem // by readback of sentinal value at the tail end of the kernarg surface (allocated above) // This needs to be done for PCIE connected devices only. 
HDP path is disabled for XGMI // between CPU<->GPU if (!device->isXgmi()) { static int host_val = 1; address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_ - sizeof(int); *dev_ptr = host_val; if (device->info().hdpMemFlushCntl == nullptr) { amd::Command* command = new amd::Marker(*stream, true); if (command != nullptr) { command->enqueue(); command->release(); } } else { *device->info().hdpMemFlushCntl = 1; } while (*dev_ptr != host_val); host_val++; } } ResetQueueIndex(); } return status; } hipError_t FillCommands(std::vector>& parallelLists, std::unordered_map>& nodeWaitLists, std::vector& topoOrder, Graph* clonedGraph, amd::Command*& graphStart, amd::Command*& graphEnd, hip::Stream* stream) { hipError_t status = hipSuccess; for (auto& node : topoOrder) { // TODO: clone commands from next launch status = node->CreateCommand(node->GetQueue()); if (status != hipSuccess) return status; amd::Command::EventWaitList waitList; for (auto depNode : nodeWaitLists[node]) { for (auto command : depNode->GetCommands()) { waitList.push_back(command); } } node->UpdateEventWaitLists(waitList); } std::vector rootNodes = clonedGraph->GetRootNodes(); ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] RootCommand get launched on stream (stream:%p)\n", stream); for (auto& root : rootNodes) { //If rootnode is launched on to the same stream dont add dependency if (root->GetQueue() != stream) { if (graphStart == nullptr) { graphStart = new amd::Marker(*stream, false, {}); if (graphStart == nullptr) { return hipErrorOutOfMemory; } } amd::Command::EventWaitList waitList; waitList.push_back(graphStart); auto commands = root->GetCommands(); if (!commands.empty()) { commands[0]->updateEventWaitList(waitList); } } } // graphEnd ensures next enqueued ones start after graph is finished (all parallel branches) amd::Command::EventWaitList graphLastCmdWaitList; std::vector leafNodes = clonedGraph->GetLeafNodes(); for (auto& leaf : leafNodes) { // If leaf node is launched on to the same 
stream dont add dependency if (leaf->GetQueue() != stream) { amd::Command::EventWaitList waitList; waitList.push_back(graphEnd); auto commands = leaf->GetCommands(); if (!commands.empty()) { graphLastCmdWaitList.push_back(commands.back()); } } } if (!graphLastCmdWaitList.empty()) { graphEnd = new amd::Marker(*stream, false, graphLastCmdWaitList); ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] EndCommand will get launched on stream (stream:%p)\n", stream); if (graphEnd == nullptr) { graphStart->release(); return hipErrorOutOfMemory; } } return hipSuccess; } void UpdateStream(std::vector>& parallelLists, hip::Stream* stream, GraphExec* ptr) { int i = 0; for (const auto& list : parallelLists) { // first parallel list will be launched on the same queue as parent if (i == 0) { for (auto& node : list) { node->SetStream(stream, ptr); } } else { // New stream for parallel branches hip::Stream* stream = ptr->GetAvailableStreams(); for (auto& node : list) { node->SetStream(stream, ptr); } } i++; } } hipError_t GraphExec::Run(hipStream_t stream) { hipError_t status = hipSuccess; if (hip::getStream(stream) == nullptr) { return hipErrorInvalidResourceHandle; } auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast(stream); if (flags_ & hipGraphInstantiateFlagAutoFreeOnLaunch) { if (!topoOrder_.empty()) { topoOrder_[0]->GetParentGraph()->FreeAllMemory(hip_stream); } } // If this is a repeat launch, make sure corresponding MemFreeNode exists for a MemAlloc node if (repeatLaunch_ == true) { for (auto& node : topoOrder_) { if (node->GetType() == hipGraphNodeTypeMemAlloc && static_cast(node)->IsActiveMem() == true) { return hipErrorInvalidValue; } } } else { repeatLaunch_ = true; } if (parallelLists_.size() == 1) { amd::AccumulateCommand* accumulate = nullptr; bool isLastPacketKernel = false; if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { uint8_t* lastCapturedPacket = (topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) ? 
topoOrder_.back()->GetAqlPacket() : nullptr; accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr, lastCapturedPacket); } for (int i = 0; i < topoOrder_.size() - 1; i++) { if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) { hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate); accumulate->addKernelName(topoOrder_[i]->GetKernelName()); } else { topoOrder_[i]->SetStream(hip_stream, this); status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue()); topoOrder_[i]->EnqueueCommands(stream); } } // If last captured packet is kernel, optimize to detect completion of last kernel // This saves on extra packet submitted to determine end of graph if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) { // Add the last kernel node name to the accumulate command accumulate->addKernelName(topoOrder_.back()->GetKernelName()); accumulate->enqueue(); accumulate->release(); isLastPacketKernel = true; } else { topoOrder_.back()->SetStream(hip_stream, this); status = topoOrder_.back()->CreateCommand(topoOrder_.back()->GetQueue()); topoOrder_.back()->EnqueueCommands(stream); } // If last packet is not kernel, submit a marker to detect end of graph if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && !isLastPacketKernel) { accumulate->enqueue(); accumulate->release(); } } else { UpdateStream(parallelLists_, hip_stream, this); amd::Command* rootCommand = nullptr; amd::Command* endCommand = nullptr; status = FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, clonedGraph_, rootCommand, endCommand, hip_stream); if (status != hipSuccess) { return status; } if (rootCommand != nullptr) { rootCommand->enqueue(); rootCommand->release(); } for (int i = 0; i < topoOrder_.size(); i++) { topoOrder_[i]->EnqueueCommands(stream); } if (endCommand != nullptr) { endCommand->enqueue(); endCommand->release(); } } ResetQueueIndex(); return status; } } // namespace hip