Summary of this patch (hsa-runtime):
  * Adds a user_visible flag to core::MemoryRegion and AMD::MemoryRegion.
  * CpuAgent::VisitRegion and GpuAgent::VisitRegion skip regions that are not user-visible.
  * GpuAgent always creates the fine-grain local-memory region; it is user-visible only when
    HiveID != 0 or the fine_grain_pcie flag is set.
  * GpuAgent::InitNumaAllocator() becomes InitAllocators() and additionally installs
    finegrain_allocator_ / finegrain_deallocator_.

diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index db4c8f72b5..bf3e4635fb 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -411,6 +411,13 @@ class GpuAgent : public GpuAgentInt {
 
   const std::function<void(void*)>& system_deallocator() const { return system_deallocator_; }
 
+  const std::function<void*(size_t size, MemoryRegion::AllocateFlags flags)>&
+  finegrain_allocator() const {
+    return finegrain_allocator_;
+  }
+
+  const std::function<void(void*)>& finegrain_deallocator() const { return finegrain_deallocator_; }
+
  protected:
   // Sizes are in packets.
   static const uint32_t minAqlSize_ = 0x40; // 4KB min
@@ -581,8 +588,8 @@ class GpuAgent : public GpuAgentInt {
   // @brief Setup GWS accessing queue.
   void InitGWS();
 
-  // @brief Setup NUMA aware system memory allocator.
-  void InitNumaAllocator();
+  // @brief Set up the system (kernarg) and fine-grain memory allocators.
+  void InitAllocators();
 
   // @brief Initialize scratch handler thresholds
   void InitAsyncScratchThresholds();
@@ -657,6 +664,10 @@ class GpuAgent : public GpuAgentInt {
 
   std::function<void(void*)> system_deallocator_;
 
+  // Fine-grain allocator and deallocator for this device's local memory
+  std::function<void*(size_t size, MemoryRegion::AllocateFlags flags)> finegrain_allocator_;
+
+  std::function<void(void*)> finegrain_deallocator_;
   // @brief device handle
   amdgpu_device_handle ldrm_dev_;
 
diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h
index 0d774dece0..adc2d16452 100644
--- a/runtime/hsa-runtime/core/inc/amd_memory_region.h
+++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h
@@ -96,7 +96,7 @@ class MemoryRegion : public core::MemoryRegion {
   static void MakeKfdMemoryUnresident(const void* ptr);
 
   MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain,
-               core::Agent* owner, const HsaMemoryProperties& mem_props);
+               bool user_visible, core::Agent* owner, const HsaMemoryProperties& mem_props);
 
   ~MemoryRegion();
 
diff --git a/runtime/hsa-runtime/core/inc/memory_region.h b/runtime/hsa-runtime/core/inc/memory_region.h
index f36b195e7e..6cd127b894 100644
--- a/runtime/hsa-runtime/core/inc/memory_region.h
+++ b/runtime/hsa-runtime/core/inc/memory_region.h
@@ -58,11 +58,12 @@ class Agent;
 class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
  public:
   MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain,
-               core::Agent* owner)
+               bool user_visible, core::Agent* owner)
       : fine_grain_(fine_grain),
         kernarg_(kernarg),
         full_profile_(full_profile),
         extended_scope_fine_grain_(extended_scope_fine_grain),
+        user_visible_(user_visible),
         owner_(owner) {
     assert(owner_ != NULL);
   }
@@ -132,6 +133,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
 
   __forceinline bool full_profile() const { return full_profile_; }
 
+  __forceinline bool user_visible() const { return user_visible_; }
+
   __forceinline core::Agent* owner() const { return owner_; }
 
  private:
@@ -139,6 +142,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
   const bool kernarg_;
   const bool full_profile_;
   const bool extended_scope_fine_grain_;
+  const bool user_visible_;
+
   core::Agent* owner_;
 };
 } // namespace core
diff --git a/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp
index 7e68c7d23b..df473d4219 100644
--- a/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp
@@ -85,15 +85,15 @@ void CpuAgent::InitRegionList() {
     if (system_prop != mem_props.end()) system_props = *system_prop;
 
     MemoryRegion* system_region_fine =
-        new MemoryRegion(true, false, is_apu_node, false, this, system_props);
+        new MemoryRegion(true, false, is_apu_node, false, true, this, system_props);
     regions_.push_back(system_region_fine);
     MemoryRegion* system_region_kernarg =
-        new MemoryRegion(true, true, is_apu_node, false, this, system_props);
+        new MemoryRegion(true, true, is_apu_node, false, true, this, system_props);
     regions_.push_back(system_region_kernarg);
 
     if (!is_apu_node) {
       MemoryRegion* system_region_coarse =
-          new MemoryRegion(false, false, is_apu_node, false, this, system_props);
+          new MemoryRegion(false, false, is_apu_node, false, true, this, system_props);
       regions_.push_back(system_region_coarse);
     }
   }
@@ -152,6 +152,7 @@ hsa_status_t CpuAgent::VisitRegion(
     hsa_status_t (*callback)(hsa_region_t region, void* data),
     void* data) const {
   for (const core::MemoryRegion* region : regions) {
+    if (!region->user_visible()) continue;
     hsa_region_t region_handle = core::MemoryRegion::Convert(region);
     hsa_status_t status = callback(region_handle, data);
     if (status != HSA_STATUS_SUCCESS) {
diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 455d3eedd0..1c179f06ec 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -448,19 +448,20 @@ void GpuAgent::InitRegionList() {
       case HSA_HEAPTYPE_GPU_LDS:
       case HSA_HEAPTYPE_GPU_SCRATCH: {
         MemoryRegion* region =
-            new MemoryRegion(false, false, false, false, this, mem_props[mem_idx]);
+            new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]);
 
         regions_.push_back(region);
 
         if (region->IsLocalMemory()) {
           regions_.push_back(
-              new MemoryRegion(false, false, false, true, this, mem_props[mem_idx]));
+              new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx]));
+
           // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
-          if ((properties_.HiveID != 0) ||
-              (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) {
-            regions_.push_back(
-                new MemoryRegion(true, false, false, false, this, mem_props[mem_idx]));
-          }
+          const bool user_visible = (properties_.HiveID != 0) ||
+              core::Runtime::runtime_singleton_->flag().fine_grain_pcie();
+          // Always created for InitAllocators(); user_visible only gates VisitRegion().
+          regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this,
+                                              mem_props[mem_idx]));
         }
         break;
       }
@@ -650,6 +651,8 @@ hsa_status_t GpuAgent::VisitRegion(
     void* data) const {
   AMD::callback_t call(callback);
   for (const core::MemoryRegion* region : regions) {
+    if (!region->user_visible()) continue;
+
     const AMD::MemoryRegion* amd_region =
         reinterpret_cast<const AMD::MemoryRegion*>(region);
 
@@ -850,7 +853,7 @@ void GpuAgent::PreloadBlits() {
 
 hsa_status_t GpuAgent::PostToolsInit() {
   // Defer memory allocation until agents have been discovered.
-  InitNumaAllocator();
+  InitAllocators();
   InitScratchPool();
   BindTrapHandler();
   InitDma();
@@ -2241,7 +2244,7 @@ void GpuAgent::Trim() {
   scratch_cache_.trim(false);
 }
 
-void GpuAgent::InitNumaAllocator() {
+void GpuAgent::InitAllocators() {
   for (auto pool : GetNearestCpuAgent()->regions()) {
     if (pool->kernarg()) {
       system_allocator_ = [pool](size_t size, size_t alignment,
@@ -2255,11 +2258,29 @@ void GpuAgent::InitNumaAllocator() {
       };
 
       system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
-
-      return;
     }
   }
-  assert(false && "Nearest NUMA node did not have a kernarg pool.");
+  assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool.");
+
+  // Set up the fine-grain allocator from this agent's local fine-grain region.
+  for (auto region : regions()) {
+    const AMD::MemoryRegion* amd_region = static_cast<const AMD::MemoryRegion*>(region);
+    if (amd_region->IsLocalMemory() && amd_region->fine_grain()) {
+      finegrain_allocator_ = [region](size_t size,
+                                      MemoryRegion::AllocateFlags alloc_flags) -> void* {
+        void* ptr = nullptr;
+        return (HSA_STATUS_SUCCESS ==
+                core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags,
+                                                                  &ptr))
+            ? ptr
+            : nullptr;
+      };
+
+      finegrain_deallocator_ = [](void* ptr) {
+        core::Runtime::runtime_singleton_->FreeMemory(ptr);
+      };
+    }
+  }
+  assert(finegrain_allocator_ && "Agent does not have a fine-grain allocator");
 }
 
 core::Agent* GpuAgent::GetNearestCpuAgent() const {
diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
index f28434d80e..213484c83a 100644
--- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
@@ -102,9 +102,10 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
 }
 
 MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
-                           bool extended_scope_fine_grain, core::Agent* owner,
+                           bool extended_scope_fine_grain, bool user_visible, core::Agent* owner,
                            const HsaMemoryProperties& mem_props)
-    : core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, owner),
+    : core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, user_visible,
+                         owner),
       mem_props_(mem_props),
       max_single_alloc_size_(0),
       virtual_size_(0),