diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 01fa1cbfde..d2dae376f7 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -479,6 +479,18 @@ class GpuAgent : public GpuAgentInt { const std::function& finegrain_deallocator() const { return finegrain_deallocator_; } + /// @brief Allocate coarse grain device memory on this GPU agent. + const std::function& + coarsegrain_allocator() const { + return coarsegrain_allocator_; + } + + /// @brief Deallocate memory allocated from the coarsegrain_allocator + /// on this GPU agent. + const std::function& coarsegrain_deallocator() const { + return coarsegrain_deallocator_; + } + protected: // Sizes are in packets. const uint32_t minAqlSize_ = 0x40; // 4KB min @@ -731,16 +743,19 @@ class GpuAgent : public GpuAgentInt { ScratchCache scratch_cache_; - // System memory allocator in the nearest NUMA node. + /// @brief System memory allocator in the nearest NUMA node. std::function system_allocator_; - + /// @brief System memory deallocator in the nearest NUMA node. std::function system_deallocator_; - - // Fine grain allocator on this device + /// @brief Fine-grain allocator on this GPU. std::function finegrain_allocator_; - + /// @brief Fine-grain deallocator on this GPU. std::function finegrain_deallocator_; + /// @brief Coarse-grain allocator on this GPU. + std::function coarsegrain_allocator_; + /// @brief Coarse-grain deallocator on this GPU. + std::function coarsegrain_deallocator_; void* trap_handler_tma_region_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 1b62fe707d..d1548ded38 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -2456,25 +2456,32 @@ void GpuAgent::InitAllocators() { } assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool."); - // Setup fine-grain allocator + // Setup this GPU's fine-grain and coarse-grain allocators. for (auto region : regions()) { - const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region; - if (amd_region->IsLocalMemory() && amd_region->fine_grain()) { - finegrain_allocator_ = [region](size_t size, - MemoryRegion::AllocateFlags alloc_flags) -> void* { - void* ptr = nullptr; - return (HSA_STATUS_SUCCESS == - core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr)) - ? ptr - : nullptr; - }; + const AMD::MemoryRegion* amd_region = static_cast(region); - finegrain_deallocator_ = [](void* ptr) { - core::Runtime::runtime_singleton_->FreeMemory(ptr); - }; + auto region_allocator = [region](size_t size, + MemoryRegion::AllocateFlags alloc_flags) -> void* { + void* ptr = nullptr; + return (HSA_STATUS_SUCCESS == + core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr)) + ? ptr + : nullptr; + }; + + auto region_deallocator = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; + + if (amd_region->IsLocalMemory() && amd_region->fine_grain()) { + finegrain_allocator_ = region_allocator; + finegrain_deallocator_ = region_deallocator; + } else if (amd_region->IsLocalMemory() && + !(amd_region->fine_grain() || amd_region->extended_scope_fine_grain())) { + coarsegrain_allocator_ = region_allocator; + coarsegrain_deallocator_ = region_deallocator; } } - assert(finegrain_deallocator_ && "Agent does not have a fine-grain allocator"); + assert(finegrain_allocator_ && "GPU agent does not have a fine-grain allocator"); + assert(coarsegrain_allocator_ && "GPU agent does not have a coarse-grain allocator"); } core::Agent* GpuAgent::GetNearestCpuAgent() const {