From 291f079669f1755ea8e2b59dd06bc5211de205a1 Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Tue, 11 Jun 2024 11:59:05 +0000 Subject: [PATCH] SWDEV-467102 - Hidden heap init for graph capture If the graph has kernels that perform device-side allocation, the heap is allocated during packet capture, because the heap pointer has to be added to the AQL packet, and is initialized during graph launch. Handle the race with a wait when two kernels that use the device heap are enqueued on multiple streams. Change-Id: I45933b77fcaf7bc8fdf1bc906462e32b5d8d3688 [ROCm/clr commit: 57156c524d491579c1a4517c250146c7810410c3] --- .../clr/hipamd/src/hip_graph_internal.cpp | 25 ++++++++++++++----- .../clr/hipamd/src/hip_graph_internal.hpp | 5 ++++ projects/clr/rocclr/device/device.hpp | 7 ++++-- projects/clr/rocclr/device/pal/palvirtual.hpp | 2 ++ projects/clr/rocclr/device/rocm/rocblit.cpp | 3 +-- projects/clr/rocclr/device/rocm/rocdevice.cpp | 18 ++++++++++--- projects/clr/rocclr/device/rocm/rocdevice.hpp | 3 ++- .../clr/rocclr/device/rocm/rocvirtual.cpp | 9 +++++-- .../clr/rocclr/device/rocm/rocvirtual.hpp | 2 ++ 9 files changed, 57 insertions(+), 17 deletions(-) diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 6ea32142b9..3adda8cf31 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -358,8 +358,7 @@ void GetKernelArgSizeForGraph(std::vector>& parallelLists, // arg size required for all graph kernel nodes to allocate for (const auto& list : parallelLists) { for (auto& node : list) { - if (node->GetType() == hipGraphNodeTypeKernel && - !reinterpret_cast(node)->HasHiddenHeap()) { + if (node->GetType() == hipGraphNodeTypeKernel) { kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize(); } else if (node->GetType() == hipGraphNodeTypeGraph) { auto& childParallelLists = reinterpret_cast(node)->GetParallelLists(); @@ -375,8 +374,13 @@ hipError_t 
AllocKernelArgForGraph(std::vector& topoOrder, hip::Stream hip::GraphExec* graphExec) { hipError_t status = hipSuccess; for (auto& node : topoOrder) { - if (node->GetType() == hipGraphNodeTypeKernel && - !reinterpret_cast(node)->HasHiddenHeap()) { + if (node->GetType() == hipGraphNodeTypeKernel) { + // Check if graph requires hidden heap and set as part of graphExec param. + static bool initialized = false; + if (!initialized && reinterpret_cast(node)->HasHiddenHeap()) { + graphExec->SetHiddenHeap(); + initialized = true; + } auto kernelNode = reinterpret_cast(node); // From the kernel pool allocate the kern arg size required for the current kernel node. address kernArgOffset = nullptr; @@ -591,8 +595,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::St accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr); } for (int i = 0; i < topoOrder.size(); i++) { - if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel && - !reinterpret_cast(topoOrder[i])->HasHiddenHeap()) { + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) { if (topoOrder[i]->GetEnabled()) { hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(), topoOrder[i]->GetKernelName(), @@ -640,6 +643,16 @@ hipError_t GraphExec::Run(hipStream_t stream) { if (parallelLists_.size() == 1 && instantiateDeviceId_ == hip_stream->DeviceId()) { + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { + // If the graph has kernels that does device side allocation, during packet capture, heap is + // allocated because heap pointer has to be added to the AQL packet, and initialized during + // graph launch. 
+ static bool initialized = false; + if (!initialized && HasHiddenHeap()) { + hip_stream->vdev()->HiddenHeapInit(); + initialized = true; + } + } status = EnqueueGraphWithSingleList(topoOrder_, hip_stream, this); } else if (parallelLists_.size() == 1 && instantiateDeviceId_ != hip_stream->DeviceId()) { diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index 60a75d429d..3255724ad1 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -569,6 +569,7 @@ struct GraphExec : public amd::ReferenceCountedObject { uint32_t kernarg_graph_cur_offset_ = 0; uint32_t kernarg_graph_size_ = 128 * Ki; int instantiateDeviceId_ = -1; + bool hasHiddenHeap_ = false; //!< Kernel has hidden heap(device side allocation) public: GraphExec(std::vector& topoOrder, std::vector>& lists, @@ -619,6 +620,10 @@ struct GraphExec : public amd::ReferenceCountedObject { } return clonedNode; } + // returns if graph has nodes that require hidden heap/not + bool HasHiddenHeap() const { return hasHiddenHeap_; } + // Graph has nodes that require hidden heap. + void SetHiddenHeap() { hasHiddenHeap_ = true; } address allocKernArg(size_t size, size_t alignment) { assert(alignment != 0); diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 6989a1cb5e..e92de9ad6c 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1308,7 +1308,8 @@ class VirtualDevice : public amd::HeapObject { //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; - + //! Init hidden heap for device memory allocations + virtual void HiddenHeapInit() = 0; //! 
Dispatch captured AQL packet virtual bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName, @@ -2102,7 +2103,9 @@ class Device : public RuntimeObject { static Memory* p2p_stage_; //!< Staging resources std::vector enabled_p2p_devices_; //!< List of user enabled P2P devices for this device - std::once_flag heap_initialized_; //!< Heap buffer initialization flag + std::once_flag heap_initialized_; //!< Heap buffer initialization flag + std::once_flag heap_allocated_; //!< Heap buffer allocation flag + device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device amd::Memory* arena_mem_obj_; //!< Arena memory object diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index 0670ace213..1d1ac66cbe 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -357,6 +357,8 @@ class VirtualGPU : public device::VirtualDevice { bool isFenceDirty() const { return false; } + void HiddenHeapInit() {} + inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName, amd::AccumulateCommand* vcmd = nullptr) { vcmd->addKernelName(kernelName); diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 791fd98de9..ebcc9e6c30 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -2621,8 +2621,7 @@ bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Mem address parameters = captureArguments(kernels_[blitType]); result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); releaseArguments(parameters); - synchronize(); - + gpu().releaseGpuMemoryFence(); return result; } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 57438e6f1f..1cc442795c 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp 
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -3511,7 +3511,7 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer // ================================================================================================ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { - auto HeapAllocZeroOut = [this, &gpu]() -> bool { + auto HeapAllocOnly = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -3523,12 +3523,22 @@ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { LogError("Heap buffer allocation failed!"); return false; } - bool result = static_cast(gpu.blitMgr()).initHeap( - heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); + return true; + }; + std::call_once(heap_allocated_, HeapAllocOnly); +} + +// ================================================================================================ +void Device::HiddenHeapInit(const VirtualGPU& gpu) { + auto HeapZeroOut = [this, &gpu]() -> bool { + static constexpr size_t HeapBufferSize = 128 * Ki; + bool result = static_cast(gpu.blitMgr()) + .initHeap(heap_buffer_, initial_heap_buffer_, HeapBufferSize, + initial_heap_size_ / (2 * Mi)); return result; }; - std::call_once(heap_initialized_, HeapAllocZeroOut); + std::call_once(heap_initialized_, HeapZeroOut); } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 8cc95be562..6d1b03c30d 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -594,7 +594,8 @@ class Device : public NullDevice { //! Allocates hidden heap for device memory allocations void HiddenHeapAlloc(const VirtualGPU& gpu); - + //! 
Init hidden heap for device memory allocations + void HiddenHeapInit(const VirtualGPU& gpu); uint32_t fetchSDMAMask(const device::BlitManager* handle, bool readEngine = true) const; void resetSDMAMask(const device::BlitManager* handle) const; void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 34878c2bbb..5d580d3b6a 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -2962,6 +2962,8 @@ static inline void nontemporalMemcpy( #endif } +void VirtualGPU::HiddenHeapInit() { const_cast(dev()).HiddenHeapInit(*this); } + // ================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, void* eventHandle, @@ -3016,7 +3018,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, amd::Memory* const* memories = reinterpret_cast(parameters + kernelParams.memoryObjOffset()); - + bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState(); for (int j = 0; j < iteration; j++) { // Reset global size for dimension dim if split is needed if (dim != -1) { @@ -3143,6 +3145,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const_cast(dev()).HiddenHeapAlloc(*this); } if (dev().HeapBuffer() != nullptr) { + // Initialize hidden heap buffer + if (!isGraphCapture) { + const_cast(dev()).HiddenHeapInit(*this); + } // Add heap pointer to the code size_t heap_ptr = static_cast(dev().HeapBuffer()->virtualAddress()); WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_); @@ -3225,7 +3231,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, } address argBuffer = hidden_arguments; - bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState(); size_t 
argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize()); // Find all parameters for the current kernel diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index abb67689bd..053c9751b7 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -419,6 +419,8 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } + void HiddenHeapInit(); + void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; } uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); } // } roc OpenCL integration