diff --git a/runtime/hsa-runtime/core/inc/agent.h b/runtime/hsa-runtime/core/inc/agent.h index 08683934cf..c76064ac00 100644 --- a/runtime/hsa-runtime/core/inc/agent.h +++ b/runtime/hsa-runtime/core/inc/agent.h @@ -61,6 +61,8 @@ class Signal; typedef void (*HsaEventCallback)(hsa_status_t status, hsa_queue_t* source, void* data); +class MemoryRegion; + // Agent is intended to be an pure interface class and may be wrapped or // replaced by tools libraries. All funtions other than Convert, node_id, // device_type, and public_handle must be virtual. @@ -258,10 +260,6 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { return stat; } - virtual void Trim() { - for (auto region : regions()) region->Trim(); - } - protected: // Intention here is to have a polymorphic update procedure for public_handle_ // which is callable on any Agent* but only from some class dervied from diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index c00005ff0f..b86f706d75 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -55,7 +55,6 @@ #include "core/inc/blit.h" #include "core/inc/signal.h" #include "core/inc/cache.h" -#include "core/inc/scratch_cache.h" #include "core/util/small_heap.h" #include "core/util/locks.h" #include "core/util/lazy_ptr.h" @@ -64,7 +63,18 @@ namespace rocr { namespace AMD { class MemoryRegion; -typedef ScratchCache::ScratchInfo ScratchInfo; +// @brief Contains scratch memory information. +struct ScratchInfo { + void* queue_base; + size_t size; + size_t size_per_thread; + uint32_t lanes_per_wave; + ptrdiff_t queue_process_offset; + bool large; + bool retry; + hsa_signal_t queue_retry; + uint64_t wanted_slots; +}; // @brief Interface to represent a GPU agent. class GpuAgentInt : public core::Agent { @@ -321,8 +331,6 @@ class GpuAgent : public GpuAgentInt { return memory_max_frequency_; } - void Trim() override; - protected: static const uint32_t minAqlSize_ = 0x1000; // 4KB min static const uint32_t maxAqlSize_ = 0x20000; // 8MB max @@ -486,10 +494,6 @@ class GpuAgent : public GpuAgentInt { // @brief Deregister scratch notification signals. void ClearScratchNotifiers() { scratch_notifiers_.clear(); } - // @brief Releases scratch back to the driver. - // caller must hold scratch_lock_. - void ReleaseScratch(void* base, size_t size, bool large); - // Bind index of peer device that is connected via xGMI links lazy_ptr& GetXgmiBlit(const core::Agent& peer_agent); @@ -513,8 +517,6 @@ class GpuAgent : public GpuAgentInt { KernelMutex lock_; } gws_queue_; - ScratchCache scratch_cache_; - DISALLOW_COPY_AND_ASSIGN(GpuAgent); }; diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h index e119a939eb..f5342610fd 100644 --- a/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -136,8 +136,6 @@ class MemoryRegion : public core::MemoryRegion { hsa_status_t AssignAgent(void* ptr, size_t size, const core::Agent& agent, hsa_access_permission_t access) const; - void Trim() const; - __forceinline bool IsLocalMemory() const { return ((mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) || (mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC)); diff --git a/runtime/hsa-runtime/core/inc/memory_region.h b/runtime/hsa-runtime/core/inc/memory_region.h index 583b13a124..4940833557 100644 --- a/runtime/hsa-runtime/core/inc/memory_region.h +++ b/runtime/hsa-runtime/core/inc/memory_region.h @@ -47,9 +47,8 @@ #include -#include "core/inc/hsa_internal.h" +#include "core/inc/agent.h" #include "core/inc/checked.h" -#include "core/util/utils.h" namespace rocr { namespace core { @@ -107,9 +106,6 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { virtual hsa_status_t AssignAgent(void* ptr, size_t size, const Agent& agent, hsa_access_permission_t access) const = 0; - // Releases any cached memory that may be held within the allocator. - virtual void Trim() const {} - __forceinline bool fine_grain() const { return fine_grain_; } __forceinline bool full_profile() const { return full_profile_; } diff --git a/runtime/hsa-runtime/core/inc/scratch_cache.h b/runtime/hsa-runtime/core/inc/scratch_cache.h deleted file mode 100644 index 5029f4781d..0000000000 --- a/runtime/hsa-runtime/core/inc/scratch_cache.h +++ /dev/null @@ -1,191 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2020-2020, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_ -#define HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_ - -#include "core/inc/amd_gpu_agent.h" -#include "core/util/locks.h" -#include "core/util/utils.h" - -#include -#include - -namespace rocr { -namespace AMD { - -class ScratchCache { - public: - struct node { - enum STATE { FREE = 0, ALLOC = 1, TRIM = 2, STEAL = 4 }; - void* base; - bool large; - uint32_t state; - - node() : base(nullptr), state(FREE) {} - - bool isFree() const { return state == FREE; } - bool trimPending() const { return state == (ALLOC | TRIM); } - - void trim() { - assert(!isFree() && "Trim of free scratch node."); - state |= TRIM; - } - void free() { - assert(!isFree() && "Free of free scratch node."); - state = FREE; - } - void alloc() { - assert(isFree() && "Alloc of non-free scratch node."); - state = ALLOC; - } - }; - - typedef ::std::multimap map_t; - typedef map_t::iterator ref_t; - typedef ::std::function deallocator_t; - - // @brief Contains scratch memory information. - struct ScratchInfo { - void* queue_base; - // Size to fill the machine with size_per_thread - size_t size; - // Size to satisfy the present dispatch without throttling. - size_t dispatch_size; - size_t size_per_thread; - uint32_t lanes_per_wave; - ptrdiff_t queue_process_offset; - bool large; - bool retry; - hsa_signal_t queue_retry; - uint64_t wanted_slots; - ScratchCache::ref_t scratch_node; - }; - - ScratchCache(const ScratchCache& rhs) = delete; - ScratchCache(ScratchCache&& rhs) = delete; - ScratchCache& operator=(const ScratchCache& rhs) = delete; - ScratchCache& operator=(ScratchCache&& rhs) = delete; - - ScratchCache(deallocator_t deallocator) : dealloc(deallocator) {} - - ~ScratchCache() { assert(map.empty() && "ScratchCache not empty at shutdown."); } - - bool alloc(ScratchInfo& info) { - ref_t it = map.upper_bound(info.size - 1); - if (it == map.end()) return false; - - // Small requests must have an exact size match and be small. - if (!info.large) { - while ((it != map.end()) && (it->first == info.size)) { - if (it->second.isFree() && (!it->second.large)) { - it->second.alloc(); - info.queue_base = it->second.base; - info.scratch_node = it; - return true; - } - it++; - } - return false; - } - - // Large requests may use a small allocation and do not require an exact size match. - while (it != map.end()) { - if (it->second.isFree()) { - it->second.alloc(); - info.queue_base = it->second.base; - info.size = it->first; - info.scratch_node = it; - return true; - } - it++; - } - return false; - } - - void free(ScratchInfo& info) { - assert(!info.scratch_node->second.isFree() && "free called on free scratch node."); - auto it = info.scratch_node; - if (it->second.trimPending()) { - dealloc(it->second.base, it->first, it->second.large); - map.erase(it); - return; - } - it->second.free(); - } - - bool trim(bool trim_nodes_in_use) { - bool ret = !map.empty(); - auto it = map.begin(); - while (it != map.end()) { - if (it->second.isFree()) { - dealloc(it->second.base, it->first, it->second.large); - auto temp = it; - it++; - map.erase(temp); - } else { - if (trim_nodes_in_use) it->second.trim(); - it++; - } - } - return ret; - } - - void insert(ScratchInfo& info) { - node n; - n.base = info.queue_base; - n.large = info.large; - n.alloc(); - - auto it = map.insert(std::make_pair(info.size, n)); - info.scratch_node = it; - } - - private: - map_t map; - deallocator_t dealloc; -}; - -} // namespace AMD -} // namespace rocr - -#endif // header guard diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 03d5c9285f..3a4352fc70 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -792,25 +792,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { // Align whole waves to 1KB. scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave); scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave; - - uint64_t lanes_per_group = - (uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) * - pkt.dispatch.workgroup_size_z; - uint64_t waves_per_group = - (lanes_per_group + scratch.lanes_per_wave - 1) / scratch.lanes_per_wave; - uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) / - pkt.dispatch.workgroup_size_x) * - ((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) / - pkt.dispatch.workgroup_size_y) * - ((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) / - pkt.dispatch.workgroup_size_z); - - scratch.wanted_slots = groups * waves_per_group; +#ifndef NDEBUG + scratch.wanted_slots = ((uint64_t(pkt.dispatch.grid_size_x) * pkt.dispatch.grid_size_y) * + pkt.dispatch.grid_size_z) / scratch.lanes_per_wave; scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots)); - scratch.wanted_slots = - Max(scratch.wanted_slots, uint64_t(queue->agent_->properties().NumShaderBanks)); - scratch.dispatch_size = - scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave; +#endif queue->agent_->AcquireQueueScratch(scratch); @@ -1131,7 +1117,7 @@ void AqlQueue::InitScratchSRD() { queue_scratch_.size_per_thread) + 1023) / 1024); tmpring_size.bits.WAVESIZE = wave_scratch; assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow."); - uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024); + uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024)); tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves); amd_queue_.compute_tmpring_size = tmpring_size.u32All; return; diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index b4163a66e8..49f44e59b6 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -91,9 +91,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) memory_bus_width_(0), memory_max_frequency_(0), ape1_base_(0), - ape1_size_(0), - scratch_cache_( - [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) { + ape1_size_(0) { const bool is_apu_node = (properties_.NumCPUCores > 0); profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE; @@ -173,7 +171,6 @@ GpuAgent::~GpuAgent() { _aligned_free(reinterpret_cast(ape1_base_)); } - scratch_cache_.trim(true); if (scratch_pool_.base() != NULL) { hsaKmtFreeMemory(scratch_pool_.base(), scratch_pool_.size()); } @@ -1010,8 +1007,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, scratch.size_per_thread = private_segment_size; const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU; - scratch.size = - scratch.size_per_thread * properties_.MaxSlotsScratchCU * scratch.lanes_per_wave * num_cu; + scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu; scratch.queue_base = nullptr; scratch.queue_process_offset = 0; @@ -1057,166 +1053,104 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { uint64_t size_per_wave = AlignUp(scratch.size_per_thread * properties_.WaveFrontSize, 1024); if (size_per_wave > MAX_WAVE_SCRATCH) return; - /* - Determine size class needed. - - Scratch allocations come in two flavors based on how it is retired. Small allocations may be - kept bound to a queue and reused by firmware. This memory can not be reclaimed by the runtime - on demand so must be kept small to avoid egregious OOM conditions. Other allocations, aka large, - may be used by firmware only for one dispatch and are then surrendered to the runtime. This has - significant latency so we don't want to make all scratch allocations large (ie single use). - - Note that the designation "large" is for contrast with "small", which must really be small - amounts of memory, and does not always imply a large quantity of memory is needed. Other - properties of the allocation may require single use and so qualify the allocation or use as - "large". - - Here we decide on the boundaries for small scratch allocations. Both the largest small single - allocation and the maximum amount of memory bound by small allocations are limited. Additionally - some legacy devices do not support large scratch. - - For small scratch we must allocate enough memory for every physical scratch slot. - For large scratch compute the minimum memory needed to run the dispatch without limiting - occupancy. - Limit total bound small scratch allocations to 1/8th of scratch pool and 1/4 of that for a single - allocation. - */ + ScopedAcquire lock(&scratch_lock_); + // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue. size_t small_limit = scratch_pool_.size() >> 3; // Lift limit for 2.10 release RCCL workaround. size_t single_limit = 146800640; //small_limit >> 2; - bool use_reclaim = true; bool large = (scratch.size > single_limit) || (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit); - if ((isa_->GetMajorVersion() < 8) || - core::Runtime::runtime_singleton_->flag().no_scratch_reclaim()) { - large = false; - use_reclaim = false; - } + large = (isa_->GetMajorVersion() < 8) ? false : large; + large = core::Runtime::runtime_singleton_->flag().no_scratch_reclaim() ? false : large; + if (large) + scratch.queue_base = scratch_pool_.alloc_high(scratch.size); + else + scratch.queue_base = scratch_pool_.alloc(scratch.size); + large |= scratch.queue_base > scratch_pool_.high_split(); + scratch.large = large; - // If large is selected then the scratch will not be retained. - // In that case allocate the minimum necessary for the dispatch since we don't need all slots. - if (large) scratch.size = scratch.dispatch_size; + scratch.queue_process_offset = + (need_queue_scratch_base) + ? uintptr_t(scratch.queue_base) + : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); - // Ensure mapping will be in whole pages. - scratch.size = AlignUp(scratch.size, 4096); - - /* - Sequence of attempts is: - check cache - attempt a new allocation - trim unused blocks from cache - attempt a new allocation - check cache for sufficient used block, steal and wait (not implemented) - trim used blocks from cache, evaluate retry - reduce occupancy - */ - - // Lambda called in place. - // Used to allow exit from nested loops. - [&]() { - ScopedAcquire lock(&scratch_lock_); - // Check scratch cache - if (scratch_cache_.alloc(scratch)) return; - - // Attempt new allocation. - for (int i = 0; i < 2; i++) { - if (large) - scratch.queue_base = scratch_pool_.alloc_high(scratch.size); - else - scratch.queue_base = scratch_pool_.alloc(scratch.size); - scratch.large = large | (scratch.queue_base > scratch_pool_.high_split()); - assert(((!scratch.large) | use_reclaim) && "Large scratch used with reclaim disabled."); - - if (scratch.queue_base != nullptr) { - if (profile_ == HSA_PROFILE_FULL) return; - if (profile_ == HSA_PROFILE_BASE) { - HSAuint64 alternate_va; - if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) == - HSAKMT_STATUS_SUCCESS) { - if (scratch.large) scratch_used_large_ += scratch.size; - scratch_cache_.insert(scratch); - return; - } - } - } - - // Scratch request failed allocation or mapping. - scratch_pool_.free(scratch.queue_base); - scratch.queue_base = nullptr; - - // Release cached scratch and retry. - // First iteration trims unused blocks, second trims all. - scratch_cache_.trim(i == 1); - } - - // Retry if large may yield needed space. - if (scratch_used_large_ != 0) { - if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true; - return; - } - - // Fail scratch allocation if reducing occupancy is disabled. - if ((!use_reclaim) || core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) - return; - - // Attempt to trim the maximum number of concurrent waves to allow scratch to fit. - if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) - debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n", - scratch.size); - const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU; - const uint64_t total_waves = scratch.size / size_per_wave; - uint64_t waves_per_cu = total_waves / num_cus; - while (waves_per_cu != 0) { - size_t size = waves_per_cu * num_cus * size_per_wave; - void* base = scratch_pool_.alloc_high(size); + if (scratch.queue_base != nullptr) { + if (profile_ == HSA_PROFILE_FULL) return; + if (profile_ == HSA_PROFILE_BASE) { HSAuint64 alternate_va; - if ((base != nullptr) && - ((profile_ == HSA_PROFILE_FULL) || - (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) { - // Scratch allocated and either full profile or map succeeded. - scratch.queue_base = base; - scratch.size = size; - scratch.large = true; - scratch_used_large_ += scratch.size; - scratch_cache_.insert(scratch); - if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) - debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size, - float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f); + if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) == + HSAKMT_STATUS_SUCCESS) { + if (large) scratch_used_large_ += scratch.size; return; } - scratch_pool_.free(base); - waves_per_cu--; } + } - // Failed to allocate minimal scratch - assert(scratch.queue_base == nullptr && "bad scratch data"); - if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) - debug_print(" Could not allocate scratch for one wave per CU.\n"); + // Scratch request failed allocation or mapping. + scratch_pool_.free(scratch.queue_base); + scratch.queue_base = nullptr; + + // Retry if large may yield needed space. + if (scratch_used_large_ != 0) { + if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true; return; - }(); + } - scratch.queue_process_offset = need_queue_scratch_base - ? uintptr_t(scratch.queue_base) - : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); + // Fail scratch allocation if reducing occupancy is disabled. + if (core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) return; + + // Attempt to trim the maximum number of concurrent waves to allow scratch to fit. + if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) + debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n", + scratch.size); + const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU; + const uint64_t total_waves = scratch.size / size_per_wave; + uint64_t waves_per_cu = total_waves / num_cus; + while (waves_per_cu != 0) { + size_t size = waves_per_cu * num_cus * size_per_wave; + void* base = scratch_pool_.alloc(size); + HSAuint64 alternate_va; + if ((base != nullptr) && + ((profile_ == HSA_PROFILE_FULL) || + (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) { + // Scratch allocated and either full profile or map succeeded. + scratch.queue_base = base; + scratch.size = size; + scratch.queue_process_offset = (need_queue_scratch_base) + ? uintptr_t(scratch.queue_base) + : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); + scratch.large = true; + scratch_used_large_ += scratch.size; + if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) + debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size, + float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f); + return; + } + scratch_pool_.free(base); + waves_per_cu--; + } + + // Failed to allocate minimal scratch + assert(scratch.queue_base == nullptr && "bad scratch data"); + if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) + debug_print(" Could not allocate scratch for one wave per CU.\n"); } void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) { - if (scratch.queue_base == nullptr) return; + if (scratch.queue_base == nullptr) { + return; + } ScopedAcquire lock(&scratch_lock_); - scratch_cache_.free(scratch); - scratch.queue_base = nullptr; -} - -void GpuAgent::ReleaseScratch(void* base, size_t size, bool large) { if (profile_ == HSA_PROFILE_BASE) { - if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(base)) { + if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) { assert(false && "Unmap scratch subrange failed!"); } } - scratch_pool_.free(base); + scratch_pool_.free(scratch.queue_base); + scratch.queue_base = nullptr; - if (large) scratch_used_large_ -= size; + if (scratch.large) scratch_used_large_ -= scratch.size; // Notify waiters that additional scratch may be available. for (auto notifier : scratch_notifiers_) { @@ -1510,11 +1444,5 @@ lazy_ptr& GpuAgent::GetBlitObject(const core::Agent& dst_agent, return GetXgmiBlit(dst_agent); } -void GpuAgent::Trim() { - Agent::Trim(); - ScopedAcquire lock(&scratch_lock_); - scratch_cache_.trim(false); -} - } // namespace amd } // namespace rocr diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index fca4398664..43581e0923 100644 --- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -201,7 +201,7 @@ hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, voi // If it fails attempt to release memory from the block allocator and retry. *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size); if (*address == nullptr) { - owner()->Trim(); + fragment_allocator_.trim(); *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size); } @@ -699,8 +699,6 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size, return HSA_STATUS_SUCCESS; } -void MemoryRegion::Trim() const { fragment_allocator_.trim(); } - void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const { assert(request_size <= block_size() && "BlockAllocator alloc request exceeds block size.");