SWDEV-307185 - Create heap for device memory allocator

Pass the allocated heap with the kernel arguments

Change-Id: Icdec09b7f937845c39e21cbca7071dc3ba791af9
This commit is contained in:
German Andryeyev
2022-03-02 19:19:29 -05:00
bovenliggende a6bcb4435a
commit 7b114a2b8b
11 gewijzigde bestanden met toevoegingen van 103 en 29 verwijderingen
+12 -1
Bestand weergeven
@@ -1,4 +1,4 @@
/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc.
/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -458,6 +458,7 @@ Device::Device()
blitProgram_(nullptr),
hwDebugMgr_(nullptr),
context_(nullptr),
heap_buffer_(nullptr),
arena_mem_obj_(nullptr),
vaCacheAccess_(nullptr),
vaCacheMap_(nullptr),
@@ -471,6 +472,11 @@ Device::~Device() {
delete vaCacheMap_;
}
if (heap_buffer_ != nullptr) {
delete heap_buffer_;
heap_buffer_ = nullptr;
}
delete vaCacheAccess_;
if (arena_mem_obj_ != nullptr) {
@@ -517,6 +523,11 @@ bool Device::create(const Isa &isa) {
if (nullptr == vaCacheMap_) {
return false;
}
if (amd::IS_HIP) {
// Allocate initial heap for device memory allocator
static constexpr size_t HeapBufferSize = 1024 * Ki;
heap_buffer_ = createMemory(HeapBufferSize);
}
return true;
}
+9 -1
Bestand weergeven
@@ -1625,6 +1625,9 @@ class Device : public RuntimeObject {
//! Allocate a chunk of device memory as a cache for a CL memory object
virtual device::Memory* createMemory(Memory& owner) const = 0;
//! Allocate a chunk of device memory without owner class.
//! \param size requested size of the allocation
//! \return the new device memory object, or nullptr when the backend could not allocate it
virtual device::Memory* createMemory(size_t size) const = 0;
//! Allocate a device sampler object
virtual bool createSampler(const Sampler&, device::Sampler**) const = 0;
@@ -1852,6 +1855,9 @@ class Device : public RuntimeObject {
//! Returns the staging buffer used for P2P transfers (the static p2p_stage_ resource)
Memory* P2PStage() const { return p2p_stage_; }
//! Returns heap buffer object for device allocator.
//! The pointer is nullptr when no heap buffer was created for this device, so callers must check it.
device::Memory* HeapBuffer() const { return heap_buffer_; }
//! Reports whether P2P access is allowed, i.e. the list of P2P access devices is not empty
bool P2PAccessAllowed() const { return p2p_access_devices_.size() > 0; }
@@ -1908,7 +1914,9 @@ class Device : public RuntimeObject {
static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
static Memory* p2p_stage_; //!< Staging resources
amd::Memory* arena_mem_obj_; //!< Arena memory object
device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device
amd::Memory* arena_mem_obj_; //!< Arena memory object
private:
const Isa *isa_; //!< Device isa
+17 -15
Bestand weergeven
@@ -50,21 +50,22 @@ struct KernelParameterDescriptor {
HiddenDefaultQueue = 12,
HiddenCompletionAction = 13,
HiddenMultiGridSync = 14,
HiddenHostcallBuffer = 15,
HiddenBlockCountX = 16,
HiddenBlockCountY = 17,
HiddenBlockCountZ = 18,
HiddenGroupSizeX = 19,
HiddenGroupSizeY = 20,
HiddenGroupSizeZ = 21,
HiddenRemainderX = 22,
HiddenRemainderY = 23,
HiddenRemainderZ = 24,
HiddenGridDims = 25,
HiddenPrivateBase = 26,
HiddenSharedBase = 27,
HiddenQueuePtr = 28,
HiddenLast = 29
HiddenHeap = 15,
HiddenHostcallBuffer = 16,
HiddenBlockCountX = 17,
HiddenBlockCountY = 18,
HiddenBlockCountZ = 19,
HiddenGroupSizeX = 20,
HiddenGroupSizeY = 21,
HiddenGroupSizeZ = 22,
HiddenRemainderX = 23,
HiddenRemainderY = 24,
HiddenRemainderZ = 25,
HiddenGridDims = 26,
HiddenPrivateBase = 27,
HiddenSharedBase = 28,
HiddenQueuePtr = 29,
HiddenLast = 30
};
clk_value_type_t type_; //!< The parameter's type
size_t offset_; //!< Its offset in the parameter's stack
@@ -276,6 +277,7 @@ static const std::map<std::string, uint32_t> ArgValueKindV3 = {
{"hidden_default_queue", amd::KernelParameterDescriptor::HiddenDefaultQueue},
{"hidden_completion_action", amd::KernelParameterDescriptor::HiddenCompletionAction},
{"hidden_multigrid_sync_arg", amd::KernelParameterDescriptor::HiddenMultiGridSync},
{"hidden_heap", amd::KernelParameterDescriptor::HiddenHeap},
{"hidden_hostcall_buffer", amd::KernelParameterDescriptor::HiddenHostcallBuffer},
{"hidden_block_count_x", amd::KernelParameterDescriptor::HiddenBlockCountX},
{"hidden_block_count_y", amd::KernelParameterDescriptor::HiddenBlockCountY},
+12 -1
Bestand weergeven
@@ -1675,7 +1675,7 @@ pal::Memory* Device::createImage(amd::Memory& owner, bool directAccess) const {
return gpuImage;
}
//! Allocates cache memory on the card
// ================================================================================================
device::Memory* Device::createMemory(amd::Memory& owner) const {
bool directAccess = false;
pal::Memory* memory = nullptr;
@@ -1706,6 +1706,17 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
return memory;
}
// ================================================================================================
//! Allocates an owner-less chunk of device memory (PAL backend).
//! \param size requested size of the allocation
//! \return the new memory object created with Resource::Local, or nullptr on failure.
//! Ownership of the returned object passes to the caller, which must delete it.
device::Memory* Device::createMemory(size_t size) const {
  auto buffer = new pal::Memory(*this, size);
  if ((buffer == nullptr) || !buffer->create(Resource::Local)) {
    LogError("Couldn't allocate memory on device!");
    // Don't leak the wrapper object when the underlying resource could not be created
    // (deleting a null pointer is a no-op, so this is safe on both failure paths)
    delete buffer;
    return nullptr;
  }
  return buffer;
}
// ================================================================================================
bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const {
*sampler = nullptr;
Sampler* gpuSampler = new Sampler(*this);
+4
Bestand weergeven
@@ -90,6 +90,8 @@ class NullDevice : public amd::Device {
//! The dummy device allocates nothing for an amd::Memory owner: always returns nullptr
virtual device::Memory* createMemory(amd::Memory& owner) const { return nullptr; }
//! The dummy device allocates nothing for an owner-less request either: always returns nullptr
virtual device::Memory* createMemory(size_t size) const { return nullptr; }
//! Sampler object allocation
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
@@ -365,6 +367,8 @@ class Device : public NullDevice {
//! Memory allocation
virtual device::Memory* createMemory(amd::Memory& owner //!< abstraction layer memory object
) const;
virtual device::Memory* createMemory(size_t size //!< Size of memory allocation
) const;
//! Sampler object allocation
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
+8
Bestand weergeven
@@ -358,6 +358,14 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
break;
case amd::KernelParameterDescriptor::HiddenMultiGridSync:
break;
case amd::KernelParameterDescriptor::HiddenHeap:
if (gpu.dev().HeapBuffer() != nullptr) {
// Add heap pointer to the code
size_t heap_ptr = static_cast<size_t>(gpu.dev().HeapBuffer()->virtualAddress());
gpu.addVmMemory(reinterpret_cast<Memory*>(gpu.dev().HeapBuffer()));
WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
}
break;
case amd::KernelParameterDescriptor::HiddenBlockCountX:
WriteAqlArgAt(hidden_arguments, static_cast<uint32_t>(global[0] / local[0]),
it.size_, it.offset_);
+12 -2
Bestand weergeven
@@ -1,4 +1,4 @@
/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc.
/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -1086,7 +1086,6 @@ bool Device::populateOCLDeviceConstants() {
info_.uuid_[i] = unique_id[i+4];
}
}
if (HSA_STATUS_SUCCESS !=
hsa_agent_get_info(_bkendDevice,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT,
@@ -1867,6 +1866,17 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
return memory;
}
// ================================================================================================
//! Allocates an owner-less chunk of device memory (ROC backend).
//! \param size requested size of the allocation
//! \return the new buffer object, or nullptr on failure.
//! Ownership of the returned object passes to the caller, which must delete it.
device::Memory* Device::createMemory(size_t size) const {
  auto buffer = new roc::Buffer(*this, size);
  // Request the local-allocation path of Buffer::create() instead of its default
  static constexpr bool LocalAlloc = true;
  if ((buffer == nullptr) || !buffer->create(LocalAlloc)) {
    LogError("Couldn't allocate memory on device!");
    // Don't leak the wrapper object when the underlying allocation failed
    // (deleting a null pointer is a no-op, so this is safe on both failure paths)
    delete buffer;
    return nullptr;
  }
  return buffer;
}
// ================================================================================================
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
void* ptr = nullptr;
+5
Bestand weergeven
@@ -163,6 +163,10 @@ class NullDevice : public amd::Device {
ShouldNotReachHere();
return nullptr;
}
//! Owner-less memory allocation is not expected on the null device:
//! flags the call through ShouldNotReachHere() and returns nullptr
virtual device::Memory* createMemory(size_t size) const {
ShouldNotReachHere();
return nullptr;
}
//! Sampler object allocation
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
@@ -366,6 +370,7 @@ class Device : public NullDevice {
virtual device::Program* createProgram(amd::Program& owner, amd::option::Options* options = nullptr);
virtual device::Memory* createMemory(amd::Memory& owner) const;
virtual device::Memory* createMemory(size_t size) const;
//! Sampler object allocation
virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object
+14 -6
Bestand weergeven
@@ -708,12 +708,20 @@ void Buffer::destroy() {
}
// ================================================================================================
bool Buffer::create() {
bool Buffer::create(bool alloc_local) {
if (owner() == nullptr) {
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
if (deviceMemory_ != nullptr) {
flags_ |= HostMemoryDirectAccess;
return true;
if (alloc_local) {
deviceMemory_ = dev().deviceLocalAlloc(size());
if (deviceMemory_ != nullptr) {
flags_ |= HostMemoryDirectAccess;
return true;
}
} else {
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
if (deviceMemory_ != nullptr) {
flags_ |= HostMemoryDirectAccess;
return true;
}
}
return false;
}
@@ -1120,7 +1128,7 @@ bool Image::createInteropImage() {
return true;
}
bool Image::create() {
bool Image::create(bool alloc_local) {
if (owner()->parent() != nullptr) {
if (!ValidateMemory()) {
return false;
+3 -3
Bestand weergeven
@@ -60,7 +60,7 @@ class Memory : public device::Memory {
uint mapFlags, size_t* rowPitch, size_t* slicePitch) override;
// Create device memory according to OpenCL memory flag.
virtual bool create() = 0;
virtual bool create(bool local_alloc = false) = 0;
// Pins system memory associated with this memory object.
bool pinSystemMemory(void* hostPtr, // System memory address
@@ -169,7 +169,7 @@ class Buffer : public roc::Memory {
virtual ~Buffer();
// Create device memory according to OpenCL memory flag.
virtual bool create();
virtual bool create(bool local_alloc = false);
// Recreate the device memory using new size and alignment.
bool recreate(size_t newSize, size_t newAlignment, bool forceSystem);
@@ -198,7 +198,7 @@ class Image : public roc::Memory {
virtual ~Image();
//! Create device memory according to OpenCL memory flag.
virtual bool create();
virtual bool create(bool local_alloc = false);
//! Create an image view
bool createView(const Memory& parent);
@@ -2829,6 +2829,13 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
WriteAqlArgAt(hidden_arguments, gridSync, it.size_, it.offset_);
break;
}
case amd::KernelParameterDescriptor::HiddenHeap:
if (dev().HeapBuffer() != nullptr) {
// Add heap pointer to the code
size_t heap_ptr = static_cast<size_t>(dev().HeapBuffer()->virtualAddress());
WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
}
break;
case amd::KernelParameterDescriptor::HiddenBlockCountX:
WriteAqlArgAt(hidden_arguments, static_cast<uint32_t>(newGlobalSize[0] / local[0]),
it.size_, it.offset_);