diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 4e2f034a3c..5b0e883735 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -1943,7 +1944,6 @@ class Device : public RuntimeObject { virtual device::UriLocator* createUriLocator() const = 0; #endif #endif - protected: //! Enable the specified extension char* getExtensionString(); @@ -1967,7 +1967,8 @@ class Device : public RuntimeObject { static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources static Memory* p2p_stage_; //!< Staging resources - device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device + std::once_flag heap_initialized_; //!< Heap buffer initialization flag + device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device amd::Memory* arena_mem_obj_; //!< Arena memory object uint64_t stack_size_{0}; //!< Device stack size diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index d8e38f55dd..644d40fcd0 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -1161,15 +1161,6 @@ bool Device::initializeHeapResources() { return false; } xferQueue_->enableSyncedBlit(); - if (amd::IS_HIP) { - // Allocate initial heap for device memory allocator - static constexpr size_t HeapBufferSize = 128 * Ki; - heap_buffer_ = createMemory(HeapBufferSize); - if (heap_buffer_ == nullptr) { - LogError("Heap buffer allocation failed!"); - return false; - } - } } return true; } @@ -2322,6 +2313,22 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const { vgpusAccess().unlock(); } +// ================================================================================================ +void Device::HiddenHeapAlloc() { + auto HeapAlloc = [this]()->bool { + // Allocate initial heap for device memory allocator + static constexpr size_t HeapBufferSize = 128 * Ki; + heap_buffer_ = createMemory(HeapBufferSize); + if (heap_buffer_ == nullptr) { + LogError("Heap buffer allocation failed!"); + return false; + } + return true; + }; + std::call_once(heap_initialized_, HeapAlloc); +} + +// ================================================================================================ Device::SrdManager::~SrdManager() { for (uint i = 0; i < pool_.size(); ++i) { pool_[i].buf_->unmap(nullptr); diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp index 36c3ee5b00..833ad9a493 100644 --- a/projects/clr/rocclr/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/device/pal/paldevice.hpp @@ -626,6 +626,9 @@ class Device : public NullDevice { } #endif #endif + //! Allocates hidden heap for device memory allocations + void HiddenHeapAlloc(); + private: static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex, Pal::Developer::CallbackType type, void* pCbData); diff --git a/projects/clr/rocclr/device/pal/palkernel.cpp b/projects/clr/rocclr/device/pal/palkernel.cpp index e39f87b371..c9bd217fbd 100644 --- a/projects/clr/rocclr/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/device/pal/palkernel.cpp @@ -264,6 +264,7 @@ const HSAILProgram& HSAILKernel::prog() const { return reinterpret_cast(prog_); } +// ================================================================================================ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes, const_address params, @@ -359,10 +360,14 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const case amd::KernelParameterDescriptor::HiddenMultiGridSync: break; case amd::KernelParameterDescriptor::HiddenHeap: - if (gpu.dev().HeapBuffer() != nullptr) { + // Allocate hidden heap for HIP applications only + if ((amd::IS_HIP) && (palDevice().HeapBuffer() == nullptr)) { + const_cast(palDevice()).HiddenHeapAlloc(); + } + if (palDevice().HeapBuffer() != nullptr) { // Add heap pointer to the code - size_t heap_ptr = static_cast(gpu.dev().HeapBuffer()->virtualAddress()); - gpu.addVmMemory(reinterpret_cast(gpu.dev().HeapBuffer())); + size_t heap_ptr = static_cast(palDevice().HeapBuffer()->virtualAddress()); + gpu.addVmMemory(reinterpret_cast(palDevice().HeapBuffer())); WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_); } break; @@ -425,12 +430,12 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const break; case amd::KernelParameterDescriptor::HiddenPrivateBase: WriteAqlArgAt(hidden_arguments, - (gpu.dev().properties().gpuMemoryProperties.privateApertureBase >> AddressShift), + (palDevice().properties().gpuMemoryProperties.privateApertureBase >> AddressShift), it.size_, it.offset_); break; case amd::KernelParameterDescriptor::HiddenSharedBase: WriteAqlArgAt(hidden_arguments, - (gpu.dev().properties().gpuMemoryProperties.sharedApertureBase >> AddressShift), + (palDevice().properties().gpuMemoryProperties.sharedApertureBase >> AddressShift), it.size_, it.offset_); break; case amd::KernelParameterDescriptor::HiddenQueuePtr: @@ -485,6 +490,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const return hsaDisp; } +// ================================================================================================ const LightningProgram& LightningKernel::prog() const { return reinterpret_cast(prog_); } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index d178d7164a..36770841fa 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -787,20 +787,6 @@ bool Device::create() { return false; } - if (amd::IS_HIP) { - // Allocate initial heap for device memory allocator - static constexpr size_t HeapBufferSize = 128 * Ki; - heap_buffer_ = createMemory(HeapBufferSize); - // Clear memory to 0 for device library logic - if ((heap_buffer_ == nullptr) || - (HSA_STATUS_SUCCESS != hsa_amd_memory_fill( - reinterpret_cast(HeapBuffer()->virtualAddress()), 0, - HeapBufferSize / sizeof(uint32_t)))) { - LogError("Heap buffer allocation failed!"); - return false; - } - } - return true; } @@ -3180,6 +3166,25 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { return false; } +// ================================================================================================ +void Device::HiddenHeapAlloc() { + auto HeapAllocZeroOut = [this]()->bool { + // Allocate initial heap for device memory allocator + static constexpr size_t HeapBufferSize = 128 * Ki; + heap_buffer_ = createMemory(HeapBufferSize); + // Clear memory to 0 for device library logic + if ((heap_buffer_ == nullptr) || + (HSA_STATUS_SUCCESS != hsa_amd_memory_fill( + reinterpret_cast(HeapBuffer()->virtualAddress()), 0, + HeapBufferSize / sizeof(uint32_t)))) { + LogError("Heap buffer allocation failed!"); + return false; + } + return true; + }; + std::call_once(heap_initialized_, HeapAllocZeroOut); +} + // ================================================================================================ ProfilingSignal::~ProfilingSignal() { if (signal_.handle != 0) { diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index a6a6631710..44f416bbef 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -566,6 +566,9 @@ class Device : public NullDevice { //! Returns True if memory pointer is known to ROCr (excludes HMM allocations) bool IsValidAllocation(const void* dev_ptr, size_t size) const; + //! Allocates hidden heap for device memory allocations + void HiddenHeapAlloc(); + private: bool create(); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 3f25f31fd2..926cbfc319 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -2679,8 +2679,10 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) return true; } -bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, - const_address parameters, void* eventHandle, uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) { +// ================================================================================================ +bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, + const amd::Kernel& kernel, const_address parameters, void* eventHandle, + uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) { device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); @@ -2845,6 +2847,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const break; } case amd::KernelParameterDescriptor::HiddenHeap: + // Allocate hidden heap for HIP applications only + if ((amd::IS_HIP) && (dev().HeapBuffer() == nullptr)) { + const_cast(dev()).HiddenHeapAlloc(); + } if (dev().HeapBuffer() != nullptr) { // Add heap pointer to the code size_t heap_ptr = static_cast(dev().HeapBuffer()->virtualAddress());