Create fine-grained allocator

Create allocator helper function to provide fine-grained memory on
a specific agent.

Change-Id: I32ba9aceb9c9dc708b140a0c45158e6e7a018844
This commit is contained in:
David Yat Sin
2023-08-23 20:50:36 +00:00
والد 721e56ef5c
کامیت 71f1a6726c
6 فایلهای تغییر یافته به همراه 60 افزوده شده و 21 حذف شده
@@ -411,6 +411,13 @@ class GpuAgent : public GpuAgentInt {
// @brief Returns a const reference to the stored system memory deallocator
// callable (system_deallocator_). The callable takes the pointer to release.
const std::function<void(void*)>& system_deallocator() const { return system_deallocator_; }
// @brief Returns a const reference to the fine-grain allocator callable
// (finegrain_allocator_).
//
// The callable takes an allocation size and core::MemoryRegion::AllocateFlags
// and returns the allocated pointer. The implementation installed by
// InitAllocators() returns nullptr when the runtime allocation does not
// report HSA_STATUS_SUCCESS.
const std::function<void*(size_t size, core::MemoryRegion::AllocateFlags flags)>&
finegrain_allocator() const {
return finegrain_allocator_;
}
// @brief Returns a const reference to the fine-grain deallocator callable
// (finegrain_deallocator_). The callable takes the pointer to free.
const std::function<void(void*)>& finegrain_deallocator() const { return finegrain_deallocator_; }
protected:
// Sizes are in packets.
static const uint32_t minAqlSize_ = 0x40; // 4KB min
@@ -581,8 +588,8 @@ class GpuAgent : public GpuAgentInt {
// @brief Setup GWS accessing queue.
void InitGWS();
// @brief Setup NUMA aware system memory allocator.
void InitNumaAllocator();
// @brief Set up the system (kernarg pool) and fine-grain memory allocators.
void InitAllocators();
// @brief Initialize scratch handler thresholds
void InitAsyncScratchThresholds();
@@ -657,6 +664,10 @@ class GpuAgent : public GpuAgentInt {
std::function<void(void*)> system_deallocator_;
// Fine grain allocator on this device
std::function<void*(size_t size, core::MemoryRegion::AllocateFlags flags)> finegrain_allocator_;
std::function<void(void*)> finegrain_deallocator_;
// @brief device handle
amdgpu_device_handle ldrm_dev_;
@@ -96,7 +96,7 @@ class MemoryRegion : public core::MemoryRegion {
static void MakeKfdMemoryUnresident(const void* ptr);
MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain,
core::Agent* owner, const HsaMemoryProperties& mem_props);
bool user_visible, core::Agent* owner, const HsaMemoryProperties& mem_props);
~MemoryRegion();
@@ -58,11 +58,12 @@ class Agent;
class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
public:
// @brief Records the region's attribute flags and its owning agent.
//
// Each argument is copied into the member of the same name (with a trailing
// underscore). The user_visible flag is the value later reported by
// user_visible(); the agents' VisitRegion loops skip regions for which it is
// false.
//
// @param owner Agent that owns this region. Must not be null; this is only
// enforced by an assert.
MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain,
core::Agent* owner)
bool user_visible, core::Agent* owner)
: fine_grain_(fine_grain),
kernarg_(kernarg),
full_profile_(full_profile),
extended_scope_fine_grain_(extended_scope_fine_grain),
user_visible_(user_visible),
owner_(owner) {
// A region without an owning agent is treated as a programming error.
assert(owner_ != NULL);
}
@@ -132,6 +133,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
// @brief Returns the full_profile flag supplied at construction.
__forceinline bool full_profile() const { return full_profile_; }
// @brief Returns the user_visible flag supplied at construction. The
// CpuAgent and GpuAgent VisitRegion loops skip regions for which this is
// false, so such regions are not passed to the caller's callback.
__forceinline bool user_visible() const { return user_visible_; }
// @brief Returns the agent that owns this region (asserted non-null in the
// constructor).
__forceinline core::Agent* owner() const { return owner_; }
private:
@@ -139,6 +142,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
const bool kernarg_;
const bool full_profile_;
const bool extended_scope_fine_grain_;
const bool user_visible_;
core::Agent* owner_;
};
} // namespace core
@@ -85,15 +85,15 @@ void CpuAgent::InitRegionList() {
if (system_prop != mem_props.end()) system_props = *system_prop;
MemoryRegion* system_region_fine =
new MemoryRegion(true, false, is_apu_node, false, this, system_props);
new MemoryRegion(true, false, is_apu_node, false, true, this, system_props);
regions_.push_back(system_region_fine);
MemoryRegion* system_region_kernarg =
new MemoryRegion(true, true, is_apu_node, false, this, system_props);
new MemoryRegion(true, true, is_apu_node, false, true, this, system_props);
regions_.push_back(system_region_kernarg);
if (!is_apu_node) {
MemoryRegion* system_region_coarse =
new MemoryRegion(false, false, is_apu_node, false, this, system_props);
new MemoryRegion(false, false, is_apu_node, false, true, this, system_props);
regions_.push_back(system_region_coarse);
}
}
@@ -152,6 +152,7 @@ hsa_status_t CpuAgent::VisitRegion(
hsa_status_t (*callback)(hsa_region_t region, void* data),
void* data) const {
for (const core::MemoryRegion* region : regions) {
if (!region->user_visible()) continue;
hsa_region_t region_handle = core::MemoryRegion::Convert(region);
hsa_status_t status = callback(region_handle, data);
if (status != HSA_STATUS_SUCCESS) {
@@ -448,19 +448,20 @@ void GpuAgent::InitRegionList() {
case HSA_HEAPTYPE_GPU_LDS:
case HSA_HEAPTYPE_GPU_SCRATCH: {
MemoryRegion* region =
new MemoryRegion(false, false, false, false, this, mem_props[mem_idx]);
new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]);
regions_.push_back(region);
if (region->IsLocalMemory()) {
regions_.push_back(
new MemoryRegion(false, false, false, true, this, mem_props[mem_idx]));
new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx]));
// Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
if ((properties_.HiveID != 0) ||
(core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) {
regions_.push_back(
new MemoryRegion(true, false, false, false, this, mem_props[mem_idx]));
}
bool user_visible = (properties_.HiveID != 0) ||
core::Runtime::runtime_singleton_->flag().fine_grain_pcie();
regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this,
mem_props[mem_idx]));
}
break;
}
@@ -650,6 +651,8 @@ hsa_status_t GpuAgent::VisitRegion(
void* data) const {
AMD::callback_t<decltype(callback)> call(callback);
for (const core::MemoryRegion* region : regions) {
if (!region->user_visible()) continue;
const AMD::MemoryRegion* amd_region =
reinterpret_cast<const AMD::MemoryRegion*>(region);
@@ -850,7 +853,7 @@ void GpuAgent::PreloadBlits() {
hsa_status_t GpuAgent::PostToolsInit() {
// Defer memory allocation until agents have been discovered.
InitNumaAllocator();
InitAllocators();
InitScratchPool();
BindTrapHandler();
InitDma();
@@ -2241,7 +2244,7 @@ void GpuAgent::Trim() {
scratch_cache_.trim(false);
}
void GpuAgent::InitNumaAllocator() {
void GpuAgent::InitAllocators() {
for (auto pool : GetNearestCpuAgent()->regions()) {
if (pool->kernarg()) {
system_allocator_ = [pool](size_t size, size_t alignment,
@@ -2255,11 +2258,29 @@ void GpuAgent::InitNumaAllocator() {
};
system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
return;
}
}
assert(false && "Nearest NUMA node did not have a kernarg pool.");
assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool.");
// Setup fine-grain allocator
for (auto region : regions()) {
const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region;
if (amd_region->IsLocalMemory() && amd_region->fine_grain()) {
finegrain_allocator_ = [region](size_t size,
MemoryRegion::AllocateFlags alloc_flags) -> void* {
void* ptr = nullptr;
return (HSA_STATUS_SUCCESS ==
core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr))
? ptr
: nullptr;
};
finegrain_deallocator_ = [](void* ptr) {
core::Runtime::runtime_singleton_->FreeMemory(ptr);
};
}
}
assert(finegrain_deallocator_ && "Agent does not have a fine-grain allocator");
}
core::Agent* GpuAgent::GetNearestCpuAgent() const {
@@ -102,9 +102,10 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
}
MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
bool extended_scope_fine_grain, core::Agent* owner,
bool extended_scope_fine_grain, bool user_visible, core::Agent* owner,
const HsaMemoryProperties& mem_props)
: core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, owner),
: core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, user_visible,
owner),
mem_props_(mem_props),
max_single_alloc_size_(0),
virtual_size_(0),