SWDEV-339296 - Delay hidden heap allocation until first use
Move hidden heap creation to kernel launch so that the heap is
allocated only when it is actually first used.
Change-Id: I1b65a82fc06d9129ed45a69765bf14ea3d945b04
[ROCm/clr commit: 4975f69337]
This commit is contained in:
@@ -53,6 +53,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <list>
|
||||
#include <set>
|
||||
#include <unordered_set>
|
||||
@@ -1943,7 +1944,6 @@ class Device : public RuntimeObject {
|
||||
virtual device::UriLocator* createUriLocator() const = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
protected:
|
||||
//! Enable the specified extension
|
||||
char* getExtensionString();
|
||||
@@ -1967,7 +1967,8 @@ class Device : public RuntimeObject {
|
||||
static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
|
||||
static Memory* p2p_stage_; //!< Staging resources
|
||||
|
||||
device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device
|
||||
std::once_flag heap_initialized_; //!< Heap buffer initialization flag
|
||||
device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device
|
||||
|
||||
amd::Memory* arena_mem_obj_; //!< Arena memory object
|
||||
uint64_t stack_size_{0}; //!< Device stack size
|
||||
|
||||
@@ -1161,15 +1161,6 @@ bool Device::initializeHeapResources() {
|
||||
return false;
|
||||
}
|
||||
xferQueue_->enableSyncedBlit();
|
||||
if (amd::IS_HIP) {
|
||||
// Allocate initial heap for device memory allocator
|
||||
static constexpr size_t HeapBufferSize = 128 * Ki;
|
||||
heap_buffer_ = createMemory(HeapBufferSize);
|
||||
if (heap_buffer_ == nullptr) {
|
||||
LogError("Heap buffer allocation failed!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -2322,6 +2313,22 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
|
||||
vgpusAccess().unlock();
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Device::HiddenHeapAlloc() {
|
||||
auto HeapAlloc = [this]()->bool {
|
||||
// Allocate initial heap for device memory allocator
|
||||
static constexpr size_t HeapBufferSize = 128 * Ki;
|
||||
heap_buffer_ = createMemory(HeapBufferSize);
|
||||
if (heap_buffer_ == nullptr) {
|
||||
LogError("Heap buffer allocation failed!");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
std::call_once(heap_initialized_, HeapAlloc);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
Device::SrdManager::~SrdManager() {
|
||||
for (uint i = 0; i < pool_.size(); ++i) {
|
||||
pool_[i].buf_->unmap(nullptr);
|
||||
|
||||
@@ -626,6 +626,9 @@ class Device : public NullDevice {
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
//! Allocates hidden heap for device memory allocations
|
||||
void HiddenHeapAlloc();
|
||||
|
||||
private:
|
||||
static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
|
||||
Pal::Developer::CallbackType type, void* pCbData);
|
||||
|
||||
@@ -264,6 +264,7 @@ const HSAILProgram& HSAILKernel::prog() const {
|
||||
return reinterpret_cast<const HSAILProgram&>(prog_);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
|
||||
const amd::NDRangeContainer& sizes,
|
||||
const_address params,
|
||||
@@ -359,10 +360,14 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
|
||||
case amd::KernelParameterDescriptor::HiddenMultiGridSync:
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenHeap:
|
||||
if (gpu.dev().HeapBuffer() != nullptr) {
|
||||
// Allocate hidden heap for HIP applications only
|
||||
if ((amd::IS_HIP) && (palDevice().HeapBuffer() == nullptr)) {
|
||||
const_cast<Device&>(palDevice()).HiddenHeapAlloc();
|
||||
}
|
||||
if (palDevice().HeapBuffer() != nullptr) {
|
||||
// Add heap pointer to the code
|
||||
size_t heap_ptr = static_cast<size_t>(gpu.dev().HeapBuffer()->virtualAddress());
|
||||
gpu.addVmMemory(reinterpret_cast<Memory*>(gpu.dev().HeapBuffer()));
|
||||
size_t heap_ptr = static_cast<size_t>(palDevice().HeapBuffer()->virtualAddress());
|
||||
gpu.addVmMemory(reinterpret_cast<Memory*>(palDevice().HeapBuffer()));
|
||||
WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
@@ -425,12 +430,12 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenPrivateBase:
|
||||
WriteAqlArgAt(hidden_arguments,
|
||||
(gpu.dev().properties().gpuMemoryProperties.privateApertureBase >> AddressShift),
|
||||
(palDevice().properties().gpuMemoryProperties.privateApertureBase >> AddressShift),
|
||||
it.size_, it.offset_);
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenSharedBase:
|
||||
WriteAqlArgAt(hidden_arguments,
|
||||
(gpu.dev().properties().gpuMemoryProperties.sharedApertureBase >> AddressShift),
|
||||
(palDevice().properties().gpuMemoryProperties.sharedApertureBase >> AddressShift),
|
||||
it.size_, it.offset_);
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenQueuePtr:
|
||||
@@ -485,6 +490,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
|
||||
return hsaDisp;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
const LightningProgram& LightningKernel::prog() const {
|
||||
return reinterpret_cast<const LightningProgram&>(prog_);
|
||||
}
|
||||
|
||||
@@ -787,20 +787,6 @@ bool Device::create() {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (amd::IS_HIP) {
|
||||
// Allocate initial heap for device memory allocator
|
||||
static constexpr size_t HeapBufferSize = 128 * Ki;
|
||||
heap_buffer_ = createMemory(HeapBufferSize);
|
||||
// Clear memory to 0 for device library logic
|
||||
if ((heap_buffer_ == nullptr) ||
|
||||
(HSA_STATUS_SUCCESS != hsa_amd_memory_fill(
|
||||
reinterpret_cast<void*>(HeapBuffer()->virtualAddress()), 0,
|
||||
HeapBufferSize / sizeof(uint32_t)))) {
|
||||
LogError("Heap buffer allocation failed!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -3180,6 +3166,25 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Device::HiddenHeapAlloc() {
|
||||
auto HeapAllocZeroOut = [this]()->bool {
|
||||
// Allocate initial heap for device memory allocator
|
||||
static constexpr size_t HeapBufferSize = 128 * Ki;
|
||||
heap_buffer_ = createMemory(HeapBufferSize);
|
||||
// Clear memory to 0 for device library logic
|
||||
if ((heap_buffer_ == nullptr) ||
|
||||
(HSA_STATUS_SUCCESS != hsa_amd_memory_fill(
|
||||
reinterpret_cast<void*>(HeapBuffer()->virtualAddress()), 0,
|
||||
HeapBufferSize / sizeof(uint32_t)))) {
|
||||
LogError("Heap buffer allocation failed!");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
std::call_once(heap_initialized_, HeapAllocZeroOut);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
ProfilingSignal::~ProfilingSignal() {
|
||||
if (signal_.handle != 0) {
|
||||
|
||||
@@ -566,6 +566,9 @@ class Device : public NullDevice {
|
||||
//! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
|
||||
bool IsValidAllocation(const void* dev_ptr, size_t size) const;
|
||||
|
||||
//! Allocates hidden heap for device memory allocations
|
||||
void HiddenHeapAlloc();
|
||||
|
||||
private:
|
||||
bool create();
|
||||
|
||||
|
||||
@@ -2679,8 +2679,10 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
|
||||
const_address parameters, void* eventHandle, uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
const amd::Kernel& kernel, const_address parameters, void* eventHandle,
|
||||
uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
|
||||
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
|
||||
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
|
||||
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
|
||||
@@ -2845,6 +2847,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
break;
|
||||
}
|
||||
case amd::KernelParameterDescriptor::HiddenHeap:
|
||||
// Allocate hidden heap for HIP applications only
|
||||
if ((amd::IS_HIP) && (dev().HeapBuffer() == nullptr)) {
|
||||
const_cast<Device&>(dev()).HiddenHeapAlloc();
|
||||
}
|
||||
if (dev().HeapBuffer() != nullptr) {
|
||||
// Add heap pointer to the code
|
||||
size_t heap_ptr = static_cast<size_t>(dev().HeapBuffer()->virtualAddress());
|
||||
|
||||
Reference in New Issue
Block a user