From 49cbeaea216a7ab9fcf9c37d4571ef986ace992a Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Mon, 12 Feb 2024 16:34:47 +0000 Subject: [PATCH] SWDEV-444988 - Fix __amd_rocclr_initHeap sync with DEBUG_CLR_GRAPH_PACKET_CAPTURE When kernel does device side malloc, initial heap is allocated with __amd_rocclr_initHeap. During graph launch kernel __amd_rocclr_initHeap is enqueued followed by actual kernel . So kernel will execute after initHeap kernel. But with graph optimizations during capture initHeap gets enqueued on device null stream and actual kernel on graph launch stream. So no proper synchronization. Switch to command creation and enqueue during launch for kernel node with hidden heap. Change-Id: Iaf600251faef9a448853f19429023c118aa760b9 [ROCm/clr commit: 2dc6ec68a5b6594adc5cb2c19729808d5431c30e] --- .../clr/hipamd/src/hip_graph_internal.cpp | 34 +++++++++++-------- .../clr/hipamd/src/hip_graph_internal.hpp | 9 +++++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index e297766928..9a4455ad89 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -351,7 +351,8 @@ hipError_t GraphExec::CaptureAQLPackets() { // arg size required for all graph kernel nodes to allocate for (const auto& list : parallelLists_) { for (auto& node : list) { - if (node->GetType() == hipGraphNodeTypeKernel) { + if (node->GetType() == hipGraphNodeTypeKernel && + !reinterpret_cast(node)->HasHiddenHeap()) { kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize(); } } @@ -373,7 +374,8 @@ hipError_t GraphExec::CaptureAQLPackets() { kernarg_pool_size_graph_ = kernArgSizeForGraph; } for (auto& node : topoOrder_) { - if (node->GetType() == hipGraphNodeTypeKernel) { + if (node->GetType() == hipGraphNodeTypeKernel && + !reinterpret_cast(node)->HasHiddenHeap()) { auto kernelNode = reinterpret_cast(node); // From the kernel pool allocate the kern arg size required for the current kernel node. address kernArgOffset = nullptr; @@ -560,16 +562,20 @@ hipError_t GraphExec::Run(hipStream_t stream) { if (parallelLists_.size() == 1 && instantiateDeviceId_ == hip_stream->DeviceId()) { amd::AccumulateCommand* accumulate = nullptr; - bool isLastPacketKernel = false; + bool isLastKernelWithoutHiddenHeap = + ((topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) && + !reinterpret_cast(topoOrder_.back())->HasHiddenHeap()); if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { - uint8_t* lastCapturedPacket = (topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) - ? topoOrder_.back()->GetAqlPacket() - : nullptr; - accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr, lastCapturedPacket); + uint8_t* lastCapturedPacket = + isLastKernelWithoutHiddenHeap ? topoOrder_.back()->GetAqlPacket() : nullptr; + if (topoOrder_.back()->GetEnabled()) { + accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr, lastCapturedPacket); + } } for (int i = 0; i < topoOrder_.size() - 1; i++) { - if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) { + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel && + !reinterpret_cast(topoOrder_[i])->HasHiddenHeap()) { if (topoOrder_[i]->GetEnabled()) { hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate); accumulate->addKernelName(topoOrder_[i]->GetKernelName()); @@ -583,20 +589,18 @@ hipError_t GraphExec::Run(hipStream_t stream) { // If last captured packet is kernel, optimize to detect completion of last kernel // This saves on extra packet submitted to determine end of graph - if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) { + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && isLastKernelWithoutHiddenHeap) { // Add the last kernel node name to the accumulate command - accumulate->addKernelName(topoOrder_.back()->GetKernelName()); - accumulate->enqueue(); - accumulate->release(); - isLastPacketKernel = true; + if (topoOrder_.back()->GetEnabled()) { + accumulate->addKernelName(topoOrder_.back()->GetKernelName()); + } } else { topoOrder_.back()->SetStream(hip_stream, this); status = topoOrder_.back()->CreateCommand(topoOrder_.back()->GetQueue()); topoOrder_.back()->EnqueueCommands(stream); } - // If last packet is not kernel, submit a marker to detect end of graph - if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && !isLastPacketKernel) { + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { accumulate->enqueue(); accumulate->release(); } diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index ac9f775069..e9a996ff46 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -785,11 +785,14 @@ class GraphKernelNode : public GraphNode { size_t alignedKernArgSize_; //!< Aligned size required for kernel args size_t kernargSegmentByteSize_; //!< Kernel arg segment byte size size_t kernargSegmentAlignment_; //!< Kernel arg segment alignment + bool hasHiddenHeap_; //!< Kernel has hidden heap(device side allocation) + public: size_t GetKerArgSize() const { return alignedKernArgSize_; } size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; } size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; } + bool HasHiddenHeap() const { return hasHiddenHeap_; } void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) override { out << "["; out << "style"; @@ -926,6 +929,11 @@ class GraphKernelNode : public GraphNode { } ::memcpy(kernelParams_.kernelParams[i], (pNodeParams->kernelParams[i]), desc.size_); } + for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) { + if (signature.at(i).info_.oclObject_ == amd::KernelParameterDescriptor::HiddenHeap) { + hasHiddenHeap_ = true; + } + } } // Allocate/assign memory if params are passed as part of 'extra' @@ -969,6 +977,7 @@ class GraphKernelNode : public GraphNode { } memset(&kernelAttr_, 0, sizeof(kernelAttr_)); kernelAttrInUse_ = 0; + hasHiddenHeap_ = false; } ~GraphKernelNode() { freeParams(); }