diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index bf1ee627f9..3d44b10b67 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -362,9 +362,9 @@ void GetKernelArgSizeForGraph(std::vector>& parallelLists, // GPU packet capture is enabled for kernel nodes. Calculate the kernel // arg size required for all graph kernel nodes to allocate for (const auto& list : parallelLists) { - for (auto& node : list) { - if (node->GetType() == hipGraphNodeTypeKernel) { - kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize(); + for (hip::GraphNode* node : list) { + if (node->GraphCaptureEnabled()) { + kernArgSizeForGraph += node->GetKerArgSize(); } else if (node->GetType() == hipGraphNodeTypeGraph) { auto& childParallelLists = reinterpret_cast(node)->GetParallelLists(); @@ -388,18 +388,19 @@ hipError_t AllocKernelArgForGraphNode(std::vector& topoOrder, graphExec->SetHiddenHeap(); initialized = true; } - auto kernelNode = reinterpret_cast(node); - // From the kernel pool allocate the kern arg size required for the current kernel node. + } + if (node->GraphCaptureEnabled()) { + // From the kernel pool allocate the kern arg size required for the current node. address kernArgOffset = nullptr; - if (kernelNode->GetKernargSegmentByteSize()) { + if (node->GetKernargSegmentByteSize()) { kernArgOffset = graphExec->kernArgManager_->AllocKernArg( - kernelNode->GetKernargSegmentByteSize(), kernelNode->GetKernargSegmentAlignment()); + node->GetKernargSegmentByteSize(), node->GetKernargSegmentAlignment()); if (kernArgOffset == nullptr) { return hipErrorMemoryAllocation; } } // Form GPU packet capture for the kernel node. 
- kernelNode->CaptureAndFormPacket(capture_stream, kernArgOffset); + node->CaptureAndFormPacket(capture_stream, kernArgOffset); } else if (node->GetType() == hipGraphNodeTypeGraph) { auto childNode = reinterpret_cast(node); auto& childParallelLists = childNode->GetParallelLists(); @@ -551,7 +552,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::St accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr); } for (int i = 0; i < topoOrder.size(); i++) { - if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) { + if (topoOrder[i]->GraphCaptureEnabled()) { if (topoOrder[i]->GetEnabled()) { hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(), topoOrder[i]->GetKernelName(), diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp index 20499a202b..171bfb24da 100644 --- a/hipamd/src/hip_graph_internal.hpp +++ b/hipamd/src/hip_graph_internal.hpp @@ -186,6 +186,9 @@ struct GraphNode : public hipGraphNodeDOTAttribute { unsigned int isEnabled_; uint8_t gpuPacket_[64]; //!< GPU Packet to enqueue during graph launch std::string capturedKernelName_; + size_t alignedKernArgSize_ = 256; //!< Aligned size required for kernel args + size_t kernargSegmentByteSize_ = 256; //!< Kernel arg segment byte size + size_t kernargSegmentAlignment_ = 256; //!< Kernel arg segment alignment public: GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "", @@ -237,7 +240,19 @@ struct GraphNode : public hipGraphNodeDOTAttribute { uint8_t* GetAqlPacket() { return gpuPacket_; } void SetKernelName(const std::string& kernelName) { capturedKernelName_ = kernelName; } const std::string& GetKernelName() const { return capturedKernelName_; } - + size_t GetKerArgSize() const { return alignedKernArgSize_; } + size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; } + size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; } + void 
CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) { + hipError_t status = CreateCommand(capture_stream); + for (auto& command : commands_) { + command->setCapturingState(true, GetAqlPacket(), kernArgOffset, &capturedKernelName_); + // Enqueue command to capture GPU Packet. The packet is not submitted to the device. + // The packet is stored in gpuPacket_ and submitted during graph launch. + command->submit(*(command->queue())->vdev()); + command->release(); + } + } hip::Stream* GetQueue() const { return stream_; } virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) { @@ -380,6 +395,20 @@ struct GraphNode : public hipGraphNodeDOTAttribute { } unsigned int GetEnabled() const { return isEnabled_; } void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; } + // Returns true when DEBUG_CLR_GRAPH_PACKET_CAPTURE is set and this node is a kernel node. + bool GraphCaptureEnabled() { + bool isGraphCapture = false; + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { + switch (GetType()) { + case hipGraphNodeTypeKernel: + isGraphCapture = true; + break; + default: + break; + } + } + return isGraphCapture; + } }; struct Graph { @@ -835,16 +864,9 @@ class GraphKernelNode : public GraphNode { hipKernelNodeAttrValue kernelAttr_; //!< Kernel node attributes unsigned int kernelAttrInUse_; //!< Kernel attributes in use ihipExtKernelEvents kernelEvents_; //!< Events for Ext launch kernel - size_t alignedKernArgSize_; //!< Aligned size required for kernel args - size_t kernargSegmentByteSize_; //!< Kernel arg segment byte size - size_t kernargSegmentAlignment_; //!< Kernel arg segment alignment bool hasHiddenHeap_; //!< Kernel has hidden heap(device side allocation) - public: - size_t GetKerArgSize() const { return alignedKernArgSize_; } - size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; } - size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; } bool HasHiddenHeap() const { return hasHiddenHeap_; } void 
EnqueueCommands(hipStream_t stream) override { // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker. @@ -888,21 +910,6 @@ class GraphKernelNode : public GraphNode { out << "];"; } - void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) { - hipError_t status = CreateCommand(capture_stream); - for (auto& command : commands_) { - reinterpret_cast(command)->setCapturingState( - true, GetAqlPacket(), kernArgOffset); - - // Enqueue command to capture GPU Packet. The packet is not submitted to the device. - // The packet is stored in gpuPacket_ and submitted during graph launch. - command->submit(*(command->queue())->vdev()); - // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes - SetKernelName(reinterpret_cast(command)->kernel().name()); - command->release(); - } - } - virtual std::string GetLabel(hipGraphDebugDotFlags flag) override { hipFunction_t func = getFunc(kernelParams_, ihipGetDevice()); hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func); diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index fa822ac3c1..3b10056c8a 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -1497,6 +1497,9 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) { } } } + if (command.getCapturingState()) { + currCmd_ = &command; + } } // ================================================================================================ @@ -1514,6 +1517,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) { if (AMD_DIRECT_DISPATCH) { assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty()); } + currCmd_ = nullptr; } // ================================================================================================ @@ -3018,7 +3022,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, amd::Memory* const* memories = reinterpret_cast(parameters + 
kernelParams.memoryObjOffset()); - bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState(); + bool isGraphCapture = currCmd_ != nullptr && currCmd_->getCapturingState(); for (int j = 0; j < iteration; j++) { // Reset global size for dimension dim if split is needed if (dim != -1) { @@ -3238,7 +3242,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) { // Allocate buffer to hold kernel arguments if (isGraphCapture) { - argBuffer = vcmd->getKernArgOffset(); + argBuffer = currCmd_->getKernArgOffset(); + currCmd_->SetKernelName(gpuKernel.name()); } else { argBuffer = reinterpret_cast
( @@ -3324,18 +3329,18 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; } - if (vcmd == nullptr) { + if (isGraphCapture) { // Dispatch the packet if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder, - (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS), - GPU_FLUSH_ON_EXECUTION)) { + (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS), + GPU_FLUSH_ON_EXECUTION, currCmd_->getCapturingState(), + currCmd_->getAqlPacket())) { return false; } } else { if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder, (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS), - GPU_FLUSH_ON_EXECUTION, vcmd->getCapturingState(), - vcmd->getAqlPacket())) { + GPU_FLUSH_ON_EXECUTION)) { return false; } } diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 053c9751b7..8ae66fde31 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -575,5 +575,7 @@ class VirtualGPU : public device::VirtualDevice { std::atomic lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask using KernelArgImpl = device::Settings::KernelArgImpl; + + amd::Command* currCmd_ = nullptr; //!< Current command under capture }; } diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index fa8c315b57..af70bb28ab 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -256,6 +256,10 @@ class Command : public Event { std::vector data_; const Event* waitingEvent_; //!< Waiting event associated with the marker + bool capturing_ = false; //!< Flag to enable/disable graph gpu packet capture + uint8_t* gpuPacket_ = nullptr; //!< GPU packet to capture, when graph capturing is enabled + address kernArgOffset_ = nullptr; //!< KernelArg buffer to use when graph capturing is enabled + std::string* capturedKernelName_ = nullptr; //!< Destination for the name of the kernel under capture 
protected: bool cpu_wait_ = false; //!< If true, then the command was issued for CPU/GPU sync @@ -292,6 +296,31 @@ class Command : public Event { } public: + //! Returns true if graph GPU packet capture is enabled for this command + bool getCapturingState() const { return capturing_; } + + //! Sets capture state, AQL packet buffer, kernArg buffer and captured kernel name destination + void setCapturingState(bool state, uint8_t* packet, address kernArgOffset, + std::string* capturedKernelName) { + capturing_ = state; + gpuPacket_ = packet; + kernArgOffset_ = kernArgOffset; + capturedKernelName_ = capturedKernelName; + } + + //! Writes the kernel name to the destination set by setCapturingState(), if one was provided + void SetKernelName(const std::string& kernelName) { + if (capturedKernelName_ != nullptr) { + *capturedKernelName_ = kernelName; + } + } + + //! Returns the GPU packet buffer set for capture + const uint8_t* getAqlPacket() const { return gpuPacket_; } + + //! Returns the kernel argument buffer set for capture + const address getKernArgOffset() const { return kernArgOffset_; } + //! Overload new/delete for fast commands allocation/destruction void* operator new(size_t size); void operator delete(void* ptr); @@ -1075,10 +1104,6 @@ class NDRangeKernelCommand : public Command { uint32_t firstDevice_; //!< Device index of the first device in the gridc uint32_t numWorkgroups_; //!< Total number of workgroups in the current launch - bool capturing_ = false; //!< Flag to enable/disable graph gpu packet capture - uint8_t* gpuPacket_ = nullptr; //!< GPU packet to capture, when graph capturing is enabled - address kernArgOffset_ = nullptr; //!< KernelArg buffer to used when graph capturing is enabled - public: enum { CooperativeGroups = 0x01, @@ -1086,22 +1111,6 @@ class NDRangeKernelCommand : public Command { AnyOrderLaunch = 0x04, }; - //! Returns AQL buffer state - bool getCapturingState() const { return capturing_; } - - //! 
Sets AQL capture state, aql packet to capture and where to copy kernArgs - void setCapturingState(bool state, uint8_t* packet, address kernArgOffset) { - capturing_ = state; - gpuPacket_ = packet; - kernArgOffset_ = kernArgOffset; - } - - //! returns the graph executable object command belongs to. - const uint8_t* getAqlPacket() const { return gpuPacket_; } - - //! returns the graph executable object command belongs to. - const address getKernArgOffset() const { return kernArgOffset_; } - //! Construct an ExecuteKernel command NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel, const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,