SWDEV-339296 - Delay hidden heap allocation till the usage

Move hidden heap creation to kernel launch so that the heap is
allocated only when it is first used.

Change-Id: I1b65a82fc06d9129ed45a69765bf14ea3d945b04


[ROCm/clr commit: 4975f69337]
This commit is contained in:
German Andryeyev
2022-06-14 12:18:34 -04:00
rodzic 2b3296a4ef
commit acf2856677
7 zmienionych plików z 63 dodań i 32 usunięć
+3 -2
Wyświetl plik
@@ -53,6 +53,7 @@
#include <string>
#include <vector>
#include <map>
#include <mutex>
#include <list>
#include <set>
#include <unordered_set>
@@ -1943,7 +1944,6 @@ class Device : public RuntimeObject {
virtual device::UriLocator* createUriLocator() const = 0;
#endif
#endif
protected:
//! Enable the specified extension
char* getExtensionString();
@@ -1967,7 +1967,8 @@ class Device : public RuntimeObject {
static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
static Memory* p2p_stage_; //!< Staging resources
device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device
std::once_flag heap_initialized_; //!< Heap buffer initialization flag
device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device
amd::Memory* arena_mem_obj_; //!< Arena memory object
uint64_t stack_size_{0}; //!< Device stack size
@@ -1161,15 +1161,6 @@ bool Device::initializeHeapResources() {
return false;
}
xferQueue_->enableSyncedBlit();
if (amd::IS_HIP) {
// Allocate initial heap for device memory allocator
static constexpr size_t HeapBufferSize = 128 * Ki;
heap_buffer_ = createMemory(HeapBufferSize);
if (heap_buffer_ == nullptr) {
LogError("Heap buffer allocation failed!");
return false;
}
}
}
return true;
}
@@ -2322,6 +2313,22 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
vgpusAccess().unlock();
}
// ================================================================================================
//! Allocates the hidden heap used for device-side memory allocations.
//!
//! The allocation is guarded by heap_initialized_, so it is attempted at most once
//! per device no matter how many times this method is called. A failed allocation is
//! only logged: the callable still returns normally, which marks the once-flag as
//! done, so heap_buffer_ stays nullptr and no later call retries.
void Device::HiddenHeapAlloc() {
  std::call_once(heap_initialized_, [this]() {
    // Allocate initial heap for device memory allocator
    static constexpr size_t HeapBufferSize = 128 * Ki;
    heap_buffer_ = createMemory(HeapBufferSize);
    if (heap_buffer_ == nullptr) {
      LogError("Heap buffer allocation failed!");
    }
  });
}
// ================================================================================================
Device::SrdManager::~SrdManager() {
for (uint i = 0; i < pool_.size(); ++i) {
pool_[i].buf_->unmap(nullptr);
@@ -626,6 +626,9 @@ class Device : public NullDevice {
}
#endif
#endif
//! Allocates hidden heap for device memory allocations
void HiddenHeapAlloc();
private:
static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
Pal::Developer::CallbackType type, void* pCbData);
@@ -264,6 +264,7 @@ const HSAILProgram& HSAILKernel::prog() const {
return reinterpret_cast<const HSAILProgram&>(prog_);
}
// ================================================================================================
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
const amd::NDRangeContainer& sizes,
const_address params,
@@ -359,10 +360,14 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
case amd::KernelParameterDescriptor::HiddenMultiGridSync:
break;
case amd::KernelParameterDescriptor::HiddenHeap:
if (gpu.dev().HeapBuffer() != nullptr) {
// Allocate hidden heap for HIP applications only
if ((amd::IS_HIP) && (palDevice().HeapBuffer() == nullptr)) {
const_cast<Device&>(palDevice()).HiddenHeapAlloc();
}
if (palDevice().HeapBuffer() != nullptr) {
// Add heap pointer to the code
size_t heap_ptr = static_cast<size_t>(gpu.dev().HeapBuffer()->virtualAddress());
gpu.addVmMemory(reinterpret_cast<Memory*>(gpu.dev().HeapBuffer()));
size_t heap_ptr = static_cast<size_t>(palDevice().HeapBuffer()->virtualAddress());
gpu.addVmMemory(reinterpret_cast<Memory*>(palDevice().HeapBuffer()));
WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
}
break;
@@ -425,12 +430,12 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
break;
case amd::KernelParameterDescriptor::HiddenPrivateBase:
WriteAqlArgAt(hidden_arguments,
(gpu.dev().properties().gpuMemoryProperties.privateApertureBase >> AddressShift),
(palDevice().properties().gpuMemoryProperties.privateApertureBase >> AddressShift),
it.size_, it.offset_);
break;
case amd::KernelParameterDescriptor::HiddenSharedBase:
WriteAqlArgAt(hidden_arguments,
(gpu.dev().properties().gpuMemoryProperties.sharedApertureBase >> AddressShift),
(palDevice().properties().gpuMemoryProperties.sharedApertureBase >> AddressShift),
it.size_, it.offset_);
break;
case amd::KernelParameterDescriptor::HiddenQueuePtr:
@@ -485,6 +490,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
return hsaDisp;
}
// ================================================================================================
//! Returns the stored prog_ reference reinterpreted as a LightningProgram
const LightningProgram& LightningKernel::prog() const {
  const auto& lightningProgram = reinterpret_cast<const LightningProgram&>(prog_);
  return lightningProgram;
}
@@ -787,20 +787,6 @@ bool Device::create() {
return false;
}
if (amd::IS_HIP) {
// Allocate initial heap for device memory allocator
static constexpr size_t HeapBufferSize = 128 * Ki;
heap_buffer_ = createMemory(HeapBufferSize);
// Clear memory to 0 for device library logic
if ((heap_buffer_ == nullptr) ||
(HSA_STATUS_SUCCESS != hsa_amd_memory_fill(
reinterpret_cast<void*>(HeapBuffer()->virtualAddress()), 0,
HeapBufferSize / sizeof(uint32_t)))) {
LogError("Heap buffer allocation failed!");
return false;
}
}
return true;
}
@@ -3180,6 +3166,25 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const {
return false;
}
// ================================================================================================
void Device::HiddenHeapAlloc() {
auto HeapAllocZeroOut = [this]()->bool {
// Allocate initial heap for device memory allocator
static constexpr size_t HeapBufferSize = 128 * Ki;
heap_buffer_ = createMemory(HeapBufferSize);
// Clear memory to 0 for device library logic
if ((heap_buffer_ == nullptr) ||
(HSA_STATUS_SUCCESS != hsa_amd_memory_fill(
reinterpret_cast<void*>(HeapBuffer()->virtualAddress()), 0,
HeapBufferSize / sizeof(uint32_t)))) {
LogError("Heap buffer allocation failed!");
return false;
}
return true;
};
std::call_once(heap_initialized_, HeapAllocZeroOut);
}
// ================================================================================================
ProfilingSignal::~ProfilingSignal() {
if (signal_.handle != 0) {
@@ -566,6 +566,9 @@ class Device : public NullDevice {
//! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
bool IsValidAllocation(const void* dev_ptr, size_t size) const;
//! Allocates hidden heap for device memory allocations
void HiddenHeapAlloc();
private:
bool create();
@@ -2679,8 +2679,10 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
return true;
}
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
const_address parameters, void* eventHandle, uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
const amd::Kernel& kernel, const_address parameters, void* eventHandle,
uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
@@ -2845,6 +2847,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
break;
}
case amd::KernelParameterDescriptor::HiddenHeap:
// Allocate hidden heap for HIP applications only
if ((amd::IS_HIP) && (dev().HeapBuffer() == nullptr)) {
const_cast<Device&>(dev()).HiddenHeapAlloc();
}
if (dev().HeapBuffer() != nullptr) {
// Add heap pointer to the code
size_t heap_ptr = static_cast<size_t>(dev().HeapBuffer()->virtualAddress());