SWDEV-307185 - Create heap for device memory allocator
Pass the allocated heap with the kernel arguments.

Change-Id: Icdec09b7f937845c39e21cbca7071dc3ba791af9
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -458,6 +458,7 @@ Device::Device()
|
||||
blitProgram_(nullptr),
|
||||
hwDebugMgr_(nullptr),
|
||||
context_(nullptr),
|
||||
heap_buffer_(nullptr),
|
||||
arena_mem_obj_(nullptr),
|
||||
vaCacheAccess_(nullptr),
|
||||
vaCacheMap_(nullptr),
|
||||
@@ -471,6 +472,11 @@ Device::~Device() {
|
||||
delete vaCacheMap_;
|
||||
}
|
||||
|
||||
if (heap_buffer_ != nullptr) {
|
||||
delete heap_buffer_;
|
||||
heap_buffer_ = nullptr;
|
||||
}
|
||||
|
||||
delete vaCacheAccess_;
|
||||
|
||||
if (arena_mem_obj_ != nullptr) {
|
||||
@@ -517,6 +523,11 @@ bool Device::create(const Isa &isa) {
|
||||
if (nullptr == vaCacheMap_) {
|
||||
return false;
|
||||
}
|
||||
if (amd::IS_HIP) {
|
||||
// Allocate initial heap for device memory allocator
|
||||
static constexpr size_t HeapBufferSize = 1024 * Ki;
|
||||
heap_buffer_ = createMemory(HeapBufferSize);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -1625,6 +1625,9 @@ class Device : public RuntimeObject {
|
||||
//! Allocate a chunk of device memory as a cache for a CL memory object
|
||||
virtual device::Memory* createMemory(Memory& owner) const = 0;
|
||||
|
||||
//! Allocate a chunk of device memory without owner class
|
||||
virtual device::Memory* createMemory(size_t size) const = 0;
|
||||
|
||||
//! Allocate a device sampler object
|
||||
virtual bool createSampler(const Sampler&, device::Sampler**) const = 0;
|
||||
|
||||
@@ -1852,6 +1855,9 @@ class Device : public RuntimeObject {
|
||||
//! Staging buffer for P2P transfer
|
||||
Memory* P2PStage() const { return p2p_stage_; }
|
||||
|
||||
//! Returns heap buffer object for device allocator
|
||||
//! \note May be nullptr: Device::create() only allocates the heap when amd::IS_HIP is set,
//!       and it stores the createMemory() result without checking it
device::Memory* HeapBuffer() const { return heap_buffer_; }
|
||||
|
||||
//! Does this device allow P2P access?
|
||||
bool P2PAccessAllowed() const { return p2p_access_devices_.size() > 0; }
|
||||
|
||||
@@ -1908,7 +1914,9 @@ class Device : public RuntimeObject {
|
||||
static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
|
||||
static Memory* p2p_stage_; //!< Staging resources
|
||||
|
||||
amd::Memory* arena_mem_obj_; //!< Arena memory object
|
||||
device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device
|
||||
|
||||
amd::Memory* arena_mem_obj_; //!< Arena memory object
|
||||
|
||||
private:
|
||||
const Isa *isa_; //!< Device isa
|
||||
|
||||
@@ -50,21 +50,22 @@ struct KernelParameterDescriptor {
|
||||
HiddenDefaultQueue = 12,
|
||||
HiddenCompletionAction = 13,
|
||||
HiddenMultiGridSync = 14,
|
||||
HiddenHostcallBuffer = 15,
|
||||
HiddenBlockCountX = 16,
|
||||
HiddenBlockCountY = 17,
|
||||
HiddenBlockCountZ = 18,
|
||||
HiddenGroupSizeX = 19,
|
||||
HiddenGroupSizeY = 20,
|
||||
HiddenGroupSizeZ = 21,
|
||||
HiddenRemainderX = 22,
|
||||
HiddenRemainderY = 23,
|
||||
HiddenRemainderZ = 24,
|
||||
HiddenGridDims = 25,
|
||||
HiddenPrivateBase = 26,
|
||||
HiddenSharedBase = 27,
|
||||
HiddenQueuePtr = 28,
|
||||
HiddenLast = 29
|
||||
HiddenHeap = 15,
|
||||
HiddenHostcallBuffer = 16,
|
||||
HiddenBlockCountX = 17,
|
||||
HiddenBlockCountY = 18,
|
||||
HiddenBlockCountZ = 19,
|
||||
HiddenGroupSizeX = 20,
|
||||
HiddenGroupSizeY = 21,
|
||||
HiddenGroupSizeZ = 22,
|
||||
HiddenRemainderX = 23,
|
||||
HiddenRemainderY = 24,
|
||||
HiddenRemainderZ = 25,
|
||||
HiddenGridDims = 26,
|
||||
HiddenPrivateBase = 27,
|
||||
HiddenSharedBase = 28,
|
||||
HiddenQueuePtr = 29,
|
||||
HiddenLast = 30
|
||||
};
|
||||
clk_value_type_t type_; //!< The parameter's type
|
||||
size_t offset_; //!< Its offset in the parameter's stack
|
||||
@@ -276,6 +277,7 @@ static const std::map<std::string, uint32_t> ArgValueKindV3 = {
|
||||
{"hidden_default_queue", amd::KernelParameterDescriptor::HiddenDefaultQueue},
|
||||
{"hidden_completion_action", amd::KernelParameterDescriptor::HiddenCompletionAction},
|
||||
{"hidden_multigrid_sync_arg", amd::KernelParameterDescriptor::HiddenMultiGridSync},
|
||||
{"hidden_heap", amd::KernelParameterDescriptor::HiddenHeap},
|
||||
{"hidden_hostcall_buffer", amd::KernelParameterDescriptor::HiddenHostcallBuffer},
|
||||
{"hidden_block_count_x", amd::KernelParameterDescriptor::HiddenBlockCountX},
|
||||
{"hidden_block_count_y", amd::KernelParameterDescriptor::HiddenBlockCountY},
|
||||
|
||||
@@ -1675,7 +1675,7 @@ pal::Memory* Device::createImage(amd::Memory& owner, bool directAccess) const {
|
||||
return gpuImage;
|
||||
}
|
||||
|
||||
//! Allocates cache memory on the card
|
||||
// ================================================================================================
|
||||
device::Memory* Device::createMemory(amd::Memory& owner) const {
|
||||
bool directAccess = false;
|
||||
pal::Memory* memory = nullptr;
|
||||
@@ -1706,6 +1706,17 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
|
||||
return memory;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Allocates a chunk of device memory that has no owner (amd::Memory) object.
//! The resource is created as Resource::Local.
//! \param size  Size of memory allocation
//! \return The new memory object, which the caller must delete, or nullptr on failure
device::Memory* Device::createMemory(size_t size) const {
  auto buffer = new pal::Memory(*this, size);
  if ((buffer == nullptr) || !buffer->create(Resource::Local)) {
    LogError("Couldn't allocate memory on device!");
    // Don't leak the wrapper object when the resource creation fails;
    // deleting a null pointer is a no-op.
    // NOTE(review): assumes ~Memory() tolerates an object whose create() failed - confirm
    delete buffer;
    return nullptr;
  }
  return buffer;
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const {
|
||||
*sampler = nullptr;
|
||||
Sampler* gpuSampler = new Sampler(*this);
|
||||
|
||||
@@ -90,6 +90,8 @@ class NullDevice : public amd::Device {
|
||||
|
||||
//! Just returns NULL for the dummy device
|
||||
virtual device::Memory* createMemory(amd::Memory& owner) const { return nullptr; }
|
||||
//! Just returns NULL for the dummy device
|
||||
virtual device::Memory* createMemory(size_t size) const { return nullptr; }
|
||||
|
||||
//! Sampler object allocation
|
||||
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
|
||||
@@ -365,6 +367,8 @@ class Device : public NullDevice {
|
||||
//! Memory allocation
|
||||
virtual device::Memory* createMemory(amd::Memory& owner //!< abstraction layer memory object
|
||||
) const;
|
||||
virtual device::Memory* createMemory(size_t size //!< Size of memory allocation
|
||||
) const;
|
||||
|
||||
//! Sampler object allocation
|
||||
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
|
||||
|
||||
@@ -358,6 +358,14 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenMultiGridSync:
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenHeap:
|
||||
if (gpu.dev().HeapBuffer() != nullptr) {
|
||||
// Add heap pointer to the code
|
||||
size_t heap_ptr = static_cast<size_t>(gpu.dev().HeapBuffer()->virtualAddress());
|
||||
gpu.addVmMemory(reinterpret_cast<Memory*>(gpu.dev().HeapBuffer()));
|
||||
WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenBlockCountX:
|
||||
WriteAqlArgAt(hidden_arguments, static_cast<uint32_t>(global[0] / local[0]),
|
||||
it.size_, it.offset_);
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -1086,7 +1086,6 @@ bool Device::populateOCLDeviceConstants() {
|
||||
info_.uuid_[i] = unique_id[i+4];
|
||||
}
|
||||
}
|
||||
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_get_info(_bkendDevice,
|
||||
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT,
|
||||
@@ -1867,6 +1866,17 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
|
||||
return memory;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Allocates a chunk of device memory that has no owner (amd::Memory) object.
//! The buffer is created with the local-allocation flag set.
//! \param size  Size of memory allocation
//! \return The new buffer, which the caller must delete, or nullptr on failure
device::Memory* Device::createMemory(size_t size) const {
  auto buffer = new roc::Buffer(*this, size);
  static constexpr bool LocalAlloc = true;
  if ((buffer == nullptr) || !buffer->create(LocalAlloc)) {
    LogError("Couldn't allocate memory on device!");
    // Don't leak the wrapper object when the allocation fails;
    // deleting a null pointer is a no-op.
    // NOTE(review): assumes ~Buffer() tolerates a buffer whose create() failed - confirm
    delete buffer;
    return nullptr;
  }
  return buffer;
}
|
||||
|
||||
// ================================================================================================
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
|
||||
void* ptr = nullptr;
|
||||
|
||||
@@ -163,6 +163,10 @@ class NullDevice : public amd::Device {
|
||||
ShouldNotReachHere();
|
||||
return nullptr;
|
||||
}
|
||||
virtual device::Memory* createMemory(size_t size) const {
|
||||
ShouldNotReachHere();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
//! Sampler object allocation
|
||||
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
|
||||
@@ -366,6 +370,7 @@ class Device : public NullDevice {
|
||||
virtual device::Program* createProgram(amd::Program& owner, amd::option::Options* options = nullptr);
|
||||
|
||||
virtual device::Memory* createMemory(amd::Memory& owner) const;
|
||||
virtual device::Memory* createMemory(size_t size) const;
|
||||
|
||||
//! Sampler object allocation
|
||||
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
|
||||
|
||||
@@ -708,12 +708,20 @@ void Buffer::destroy() {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Buffer::create() {
|
||||
bool Buffer::create(bool alloc_local) {
|
||||
if (owner() == nullptr) {
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
|
||||
if (deviceMemory_ != nullptr) {
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
return true;
|
||||
if (alloc_local) {
|
||||
deviceMemory_ = dev().deviceLocalAlloc(size());
|
||||
if (deviceMemory_ != nullptr) {
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
|
||||
if (deviceMemory_ != nullptr) {
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -1120,7 +1128,7 @@ bool Image::createInteropImage() {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Image::create() {
|
||||
bool Image::create(bool alloc_local) {
|
||||
if (owner()->parent() != nullptr) {
|
||||
if (!ValidateMemory()) {
|
||||
return false;
|
||||
|
||||
@@ -60,7 +60,7 @@ class Memory : public device::Memory {
|
||||
uint mapFlags, size_t* rowPitch, size_t* slicePitch) override;
|
||||
|
||||
// Create device memory according to OpenCL memory flag.
|
||||
virtual bool create() = 0;
|
||||
virtual bool create(bool local_alloc = false) = 0;
|
||||
|
||||
// Pins system memory associated with this memory object.
|
||||
bool pinSystemMemory(void* hostPtr, // System memory address
|
||||
@@ -169,7 +169,7 @@ class Buffer : public roc::Memory {
|
||||
virtual ~Buffer();
|
||||
|
||||
// Create device memory according to OpenCL memory flag.
|
||||
virtual bool create();
|
||||
virtual bool create(bool local_alloc = false);
|
||||
|
||||
// Recreate the device memory using new size and alignment.
|
||||
bool recreate(size_t newSize, size_t newAlignment, bool forceSystem);
|
||||
@@ -198,7 +198,7 @@ class Image : public roc::Memory {
|
||||
virtual ~Image();
|
||||
|
||||
//! Create device memory according to OpenCL memory flag.
|
||||
virtual bool create();
|
||||
virtual bool create(bool local_alloc = false);
|
||||
|
||||
//! Create an image view
|
||||
bool createView(const Memory& parent);
|
||||
|
||||
@@ -2829,6 +2829,13 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
WriteAqlArgAt(hidden_arguments, gridSync, it.size_, it.offset_);
|
||||
break;
|
||||
}
|
||||
case amd::KernelParameterDescriptor::HiddenHeap:
|
||||
if (dev().HeapBuffer() != nullptr) {
|
||||
// Add heap pointer to the code
|
||||
size_t heap_ptr = static_cast<size_t>(dev().HeapBuffer()->virtualAddress());
|
||||
WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenBlockCountX:
|
||||
WriteAqlArgAt(hidden_arguments, static_cast<uint32_t>(newGlobalSize[0] / local[0]),
|
||||
it.size_, it.offset_);
|
||||
|
||||
Verwijs in nieuw issue
Block a user