diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 6ea32142b9..3adda8cf31 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -358,8 +358,7 @@ void GetKernelArgSizeForGraph(std::vector>& parallelLists, // arg size required for all graph kernel nodes to allocate for (const auto& list : parallelLists) { for (auto& node : list) { - if (node->GetType() == hipGraphNodeTypeKernel && - !reinterpret_cast(node)->HasHiddenHeap()) { + if (node->GetType() == hipGraphNodeTypeKernel) { kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize(); } else if (node->GetType() == hipGraphNodeTypeGraph) { auto& childParallelLists = reinterpret_cast(node)->GetParallelLists(); @@ -375,8 +374,13 @@ hipError_t AllocKernelArgForGraph(std::vector& topoOrder, hip::Stream hip::GraphExec* graphExec) { hipError_t status = hipSuccess; for (auto& node : topoOrder) { - if (node->GetType() == hipGraphNodeTypeKernel && - !reinterpret_cast(node)->HasHiddenHeap()) { + if (node->GetType() == hipGraphNodeTypeKernel) { + // Check if graph requires hidden heap and set as part of graphExec param. + static bool initialized = false; + if (!initialized && reinterpret_cast(node)->HasHiddenHeap()) { + graphExec->SetHiddenHeap(); + initialized = true; + } auto kernelNode = reinterpret_cast(node); // From the kernel pool allocate the kern arg size required for the current kernel node. 
address kernArgOffset = nullptr; @@ -591,8 +595,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::St accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr); } for (int i = 0; i < topoOrder.size(); i++) { - if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel && - !reinterpret_cast(topoOrder[i])->HasHiddenHeap()) { + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) { if (topoOrder[i]->GetEnabled()) { hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(), topoOrder[i]->GetKernelName(), @@ -640,6 +643,16 @@ hipError_t GraphExec::Run(hipStream_t stream) { if (parallelLists_.size() == 1 && instantiateDeviceId_ == hip_stream->DeviceId()) { + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { + // If the graph has kernels that do device-side allocation, the heap is allocated during + // packet capture because the heap pointer has to be added to the AQL packet, and it is + // initialized during graph launch. 
+ static bool initialized = false; + if (!initialized && HasHiddenHeap()) { + hip_stream->vdev()->HiddenHeapInit(); + initialized = true; + } + } status = EnqueueGraphWithSingleList(topoOrder_, hip_stream, this); } else if (parallelLists_.size() == 1 && instantiateDeviceId_ != hip_stream->DeviceId()) { diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index 60a75d429d..3255724ad1 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -569,6 +569,7 @@ struct GraphExec : public amd::ReferenceCountedObject { uint32_t kernarg_graph_cur_offset_ = 0; uint32_t kernarg_graph_size_ = 128 * Ki; int instantiateDeviceId_ = -1; + bool hasHiddenHeap_ = false; //!< Graph has a kernel that needs the hidden heap (device-side allocation) public: GraphExec(std::vector& topoOrder, std::vector>& lists, @@ -619,6 +620,10 @@ struct GraphExec : public amd::ReferenceCountedObject { } return clonedNode; } + // Returns whether the graph has nodes that require the hidden heap. + bool HasHiddenHeap() const { return hasHiddenHeap_; } + // Marks the graph as having nodes that require the hidden heap. + void SetHiddenHeap() { hasHiddenHeap_ = true; } address allocKernArg(size_t size, size_t alignment) { assert(alignment != 0); diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 6989a1cb5e..e92de9ad6c 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1308,7 +1308,8 @@ class VirtualDevice : public amd::HeapObject { //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; - + //! Init hidden heap for device memory allocations + virtual void HiddenHeapInit() = 0; //! 
Dispatch captured AQL packet virtual bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName, @@ -2102,7 +2103,9 @@ class Device : public RuntimeObject { static Memory* p2p_stage_; //!< Staging resources std::vector enabled_p2p_devices_; //!< List of user enabled P2P devices for this device - std::once_flag heap_initialized_; //!< Heap buffer initialization flag + std::once_flag heap_initialized_; //!< Heap buffer initialization flag + std::once_flag heap_allocated_; //!< Heap buffer allocation flag + device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device amd::Memory* arena_mem_obj_; //!< Arena memory object diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index 0670ace213..1d1ac66cbe 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -357,6 +357,8 @@ class VirtualGPU : public device::VirtualDevice { bool isFenceDirty() const { return false; } + void HiddenHeapInit() {} + inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName, amd::AccumulateCommand* vcmd = nullptr) { vcmd->addKernelName(kernelName); diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 791fd98de9..ebcc9e6c30 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -2621,8 +2621,7 @@ bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Mem address parameters = captureArguments(kernels_[blitType]); result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); releaseArguments(parameters); - synchronize(); - + gpu().releaseGpuMemoryFence(); return result; } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 57438e6f1f..1cc442795c 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp 
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -3511,7 +3511,7 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer // ================================================================================================ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { - auto HeapAllocZeroOut = [this, &gpu]() -> bool { + auto HeapAllocOnly = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -3523,12 +3523,22 @@ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { LogError("Heap buffer allocation failed!"); return false; } - bool result = static_cast(gpu.blitMgr()).initHeap( - heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); + return true; + }; + std::call_once(heap_allocated_, HeapAllocOnly); +} + +// ================================================================================================ +void Device::HiddenHeapInit(const VirtualGPU& gpu) { + auto HeapZeroOut = [this, &gpu]() -> bool { + static constexpr size_t HeapBufferSize = 128 * Ki; + bool result = static_cast(gpu.blitMgr()) + .initHeap(heap_buffer_, initial_heap_buffer_, HeapBufferSize, + initial_heap_size_ / (2 * Mi)); return result; }; - std::call_once(heap_initialized_, HeapAllocZeroOut); + std::call_once(heap_initialized_, HeapZeroOut); } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 8cc95be562..6d1b03c30d 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -594,7 +594,8 @@ class Device : public NullDevice { //! Allocates hidden heap for device memory allocations void HiddenHeapAlloc(const VirtualGPU& gpu); - + //! 
Init hidden heap for device memory allocations + void HiddenHeapInit(const VirtualGPU& gpu); uint32_t fetchSDMAMask(const device::BlitManager* handle, bool readEngine = true) const; void resetSDMAMask(const device::BlitManager* handle) const; void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 34878c2bbb..5d580d3b6a 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -2962,6 +2962,8 @@ static inline void nontemporalMemcpy( #endif } +void VirtualGPU::HiddenHeapInit() { const_cast(dev()).HiddenHeapInit(*this); } + // ================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, void* eventHandle, @@ -3016,7 +3018,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, amd::Memory* const* memories = reinterpret_cast(parameters + kernelParams.memoryObjOffset()); - + bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState(); for (int j = 0; j < iteration; j++) { // Reset global size for dimension dim if split is needed if (dim != -1) { @@ -3143,6 +3145,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const_cast(dev()).HiddenHeapAlloc(*this); } if (dev().HeapBuffer() != nullptr) { + // Initialize hidden heap buffer + if (!isGraphCapture) { + const_cast(dev()).HiddenHeapInit(*this); + } // Add heap pointer to the code size_t heap_ptr = static_cast(dev().HeapBuffer()->virtualAddress()); WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_); @@ -3225,7 +3231,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, } address argBuffer = hidden_arguments; - bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState(); size_t 
argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize()); // Find all parameters for the current kernel diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index abb67689bd..053c9751b7 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -419,6 +419,8 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } + void HiddenHeapInit(); + void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; } uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); } // } roc OpenCL integration