diff --git a/runtime/hsa-runtime/core/inc/agent.h b/runtime/hsa-runtime/core/inc/agent.h index c76064ac00..08683934cf 100644 --- a/runtime/hsa-runtime/core/inc/agent.h +++ b/runtime/hsa-runtime/core/inc/agent.h @@ -61,8 +61,6 @@ class Signal; typedef void (*HsaEventCallback)(hsa_status_t status, hsa_queue_t* source, void* data); -class MemoryRegion; - // Agent is intended to be an pure interface class and may be wrapped or // replaced by tools libraries. All funtions other than Convert, node_id, // device_type, and public_handle must be virtual. @@ -260,6 +258,10 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { return stat; } + virtual void Trim() { + for (auto region : regions()) region->Trim(); + } + protected: // Intention here is to have a polymorphic update procedure for public_handle_ // which is callable on any Agent* but only from some class dervied from diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index b86f706d75..c00005ff0f 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -55,6 +55,7 @@ #include "core/inc/blit.h" #include "core/inc/signal.h" #include "core/inc/cache.h" +#include "core/inc/scratch_cache.h" #include "core/util/small_heap.h" #include "core/util/locks.h" #include "core/util/lazy_ptr.h" @@ -63,18 +64,7 @@ namespace rocr { namespace AMD { class MemoryRegion; -// @brief Contains scratch memory information. -struct ScratchInfo { - void* queue_base; - size_t size; - size_t size_per_thread; - uint32_t lanes_per_wave; - ptrdiff_t queue_process_offset; - bool large; - bool retry; - hsa_signal_t queue_retry; - uint64_t wanted_slots; -}; +typedef ScratchCache::ScratchInfo ScratchInfo; // @brief Interface to represent a GPU agent. class GpuAgentInt : public core::Agent { @@ -331,6 +321,8 @@ class GpuAgent : public GpuAgentInt { return memory_max_frequency_; } + void Trim() override; + protected: static const uint32_t minAqlSize_ = 0x1000; // 4KB min static const uint32_t maxAqlSize_ = 0x20000; // 8MB max @@ -494,6 +486,10 @@ class GpuAgent : public GpuAgentInt { // @brief Deregister scratch notification signals. void ClearScratchNotifiers() { scratch_notifiers_.clear(); } + // @brief Releases scratch back to the driver. + // caller must hold scratch_lock_. + void ReleaseScratch(void* base, size_t size, bool large); + // Bind index of peer device that is connected via xGMI links lazy_ptr& GetXgmiBlit(const core::Agent& peer_agent); @@ -517,6 +513,8 @@ class GpuAgent : public GpuAgentInt { KernelMutex lock_; } gws_queue_; + ScratchCache scratch_cache_; + DISALLOW_COPY_AND_ASSIGN(GpuAgent); }; diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h index f5342610fd..e119a939eb 100644 --- a/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -136,6 +136,8 @@ class MemoryRegion : public core::MemoryRegion { hsa_status_t AssignAgent(void* ptr, size_t size, const core::Agent& agent, hsa_access_permission_t access) const; + void Trim() const; + __forceinline bool IsLocalMemory() const { return ((mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) || (mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC)); diff --git a/runtime/hsa-runtime/core/inc/memory_region.h b/runtime/hsa-runtime/core/inc/memory_region.h index 4940833557..583b13a124 100644 --- a/runtime/hsa-runtime/core/inc/memory_region.h +++ b/runtime/hsa-runtime/core/inc/memory_region.h @@ -47,8 +47,9 @@ #include -#include "core/inc/agent.h" +#include "core/inc/hsa_internal.h" #include "core/inc/checked.h" +#include "core/util/utils.h" namespace rocr { namespace core { @@ -106,6 +107,9 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { virtual hsa_status_t AssignAgent(void* ptr, size_t size, const Agent& agent, hsa_access_permission_t access) const = 0; + // Releases any cached memory that may be held within the allocator. + virtual void Trim() const {} + __forceinline bool fine_grain() const { return fine_grain_; } __forceinline bool full_profile() const { return full_profile_; } diff --git a/runtime/hsa-runtime/core/inc/scratch_cache.h b/runtime/hsa-runtime/core/inc/scratch_cache.h new file mode 100644 index 0000000000..5029f4781d --- /dev/null +++ b/runtime/hsa-runtime/core/inc/scratch_cache.h @@ -0,0 +1,191 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_ +#define HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_ + +#include "core/inc/amd_gpu_agent.h" +#include "core/util/locks.h" +#include "core/util/utils.h" + +#include +#include + +namespace rocr { +namespace AMD { + +class ScratchCache { + public: + struct node { + enum STATE { FREE = 0, ALLOC = 1, TRIM = 2, STEAL = 4 }; + void* base; + bool large; + uint32_t state; + + node() : base(nullptr), state(FREE) {} + + bool isFree() const { return state == FREE; } + bool trimPending() const { return state == (ALLOC | TRIM); } + + void trim() { + assert(!isFree() && "Trim of free scratch node."); + state |= TRIM; + } + void free() { + assert(!isFree() && "Free of free scratch node."); + state = FREE; + } + void alloc() { + assert(isFree() && "Alloc of non-free scratch node."); + state = ALLOC; + } + }; + + typedef ::std::multimap map_t; + typedef map_t::iterator ref_t; + typedef ::std::function deallocator_t; + + // @brief Contains scratch memory information. + struct ScratchInfo { + void* queue_base; + // Size to fill the machine with size_per_thread + size_t size; + // Size to satisfy the present dispatch without throttling. + size_t dispatch_size; + size_t size_per_thread; + uint32_t lanes_per_wave; + ptrdiff_t queue_process_offset; + bool large; + bool retry; + hsa_signal_t queue_retry; + uint64_t wanted_slots; + ScratchCache::ref_t scratch_node; + }; + + ScratchCache(const ScratchCache& rhs) = delete; + ScratchCache(ScratchCache&& rhs) = delete; + ScratchCache& operator=(const ScratchCache& rhs) = delete; + ScratchCache& operator=(ScratchCache&& rhs) = delete; + + ScratchCache(deallocator_t deallocator) : dealloc(deallocator) {} + + ~ScratchCache() { assert(map.empty() && "ScratchCache not empty at shutdown."); } + + bool alloc(ScratchInfo& info) { + ref_t it = map.upper_bound(info.size - 1); + if (it == map.end()) return false; + + // Small requests must have an exact size match and be small. + if (!info.large) { + while ((it != map.end()) && (it->first == info.size)) { + if (it->second.isFree() && (!it->second.large)) { + it->second.alloc(); + info.queue_base = it->second.base; + info.scratch_node = it; + return true; + } + it++; + } + return false; + } + + // Large requests may use a small allocation and do not require an exact size match. + while (it != map.end()) { + if (it->second.isFree()) { + it->second.alloc(); + info.queue_base = it->second.base; + info.size = it->first; + info.scratch_node = it; + return true; + } + it++; + } + return false; + } + + void free(ScratchInfo& info) { + assert(!info.scratch_node->second.isFree() && "free called on free scratch node."); + auto it = info.scratch_node; + if (it->second.trimPending()) { + dealloc(it->second.base, it->first, it->second.large); + map.erase(it); + return; + } + it->second.free(); + } + + bool trim(bool trim_nodes_in_use) { + bool ret = !map.empty(); + auto it = map.begin(); + while (it != map.end()) { + if (it->second.isFree()) { + dealloc(it->second.base, it->first, it->second.large); + auto temp = it; + it++; + map.erase(temp); + } else { + if (trim_nodes_in_use) it->second.trim(); + it++; + } + } + return ret; + } + + void insert(ScratchInfo& info) { + node n; + n.base = info.queue_base; + n.large = info.large; + n.alloc(); + + auto it = map.insert(std::make_pair(info.size, n)); + info.scratch_node = it; + } + + private: + map_t map; + deallocator_t dealloc; +}; + +} // namespace AMD +} // namespace rocr + +#endif // header guard diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 3a4352fc70..03d5c9285f 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -792,11 +792,25 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { // Align whole waves to 1KB. scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave); scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave; -#ifndef NDEBUG - scratch.wanted_slots = ((uint64_t(pkt.dispatch.grid_size_x) * pkt.dispatch.grid_size_y) * - pkt.dispatch.grid_size_z) / scratch.lanes_per_wave; + + uint64_t lanes_per_group = + (uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) * + pkt.dispatch.workgroup_size_z; + uint64_t waves_per_group = + (lanes_per_group + scratch.lanes_per_wave - 1) / scratch.lanes_per_wave; + uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) / + pkt.dispatch.workgroup_size_x) * + ((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) / + pkt.dispatch.workgroup_size_y) * + ((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) / + pkt.dispatch.workgroup_size_z); + + scratch.wanted_slots = groups * waves_per_group; scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots)); -#endif + scratch.wanted_slots = + Max(scratch.wanted_slots, uint64_t(queue->agent_->properties().NumShaderBanks)); + scratch.dispatch_size = + scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave; queue->agent_->AcquireQueueScratch(scratch); @@ -1117,7 +1131,7 @@ void AqlQueue::InitScratchSRD() { queue_scratch_.size_per_thread) + 1023) / 1024); tmpring_size.bits.WAVESIZE = wave_scratch; assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow."); - uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024)); + uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024); tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves); amd_queue_.compute_tmpring_size = tmpring_size.u32All; return; diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 49f44e59b6..b4163a66e8 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -91,7 +91,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) memory_bus_width_(0), memory_max_frequency_(0), ape1_base_(0), - ape1_size_(0) { + ape1_size_(0), + scratch_cache_( + [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) { const bool is_apu_node = (properties_.NumCPUCores > 0); profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE; @@ -171,6 +173,7 @@ GpuAgent::~GpuAgent() { _aligned_free(reinterpret_cast(ape1_base_)); } + scratch_cache_.trim(true); if (scratch_pool_.base() != NULL) { hsaKmtFreeMemory(scratch_pool_.base(), scratch_pool_.size()); } @@ -1007,7 +1010,8 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, scratch.size_per_thread = private_segment_size; const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU; - scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu; + scratch.size = + scratch.size_per_thread * properties_.MaxSlotsScratchCU * scratch.lanes_per_wave * num_cu; scratch.queue_base = nullptr; scratch.queue_process_offset = 0; @@ -1053,104 +1057,166 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { uint64_t size_per_wave = AlignUp(scratch.size_per_thread * properties_.WaveFrontSize, 1024); if (size_per_wave > MAX_WAVE_SCRATCH) return; - ScopedAcquire lock(&scratch_lock_); - // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue. + /* + Determine size class needed. + + Scratch allocations come in two flavors based on how it is retired. Small allocations may be + kept bound to a queue and reused by firmware. This memory can not be reclaimed by the runtime + on demand so must be kept small to avoid egregious OOM conditions. Other allocations, aka large, + may be used by firmware only for one dispatch and are then surrendered to the runtime. This has + significant latency so we don't want to make all scratch allocations large (ie single use). + + Note that the designation "large" is for contrast with "small", which must really be small + amounts of memory, and does not always imply a large quantity of memory is needed. Other + properties of the allocation may require single use and so qualify the allocation or use as + "large". + + Here we decide on the boundaries for small scratch allocations. Both the largest small single + allocation and the maximum amount of memory bound by small allocations are limited. Additionally + some legacy devices do not support large scratch. + + For small scratch we must allocate enough memory for every physical scratch slot. + For large scratch compute the minimum memory needed to run the dispatch without limiting + occupancy. + Limit total bound small scratch allocations to 1/8th of scratch pool and 1/4 of that for a single + allocation. + */ size_t small_limit = scratch_pool_.size() >> 3; // Lift limit for 2.10 release RCCL workaround. size_t single_limit = 146800640; //small_limit >> 2; + bool use_reclaim = true; bool large = (scratch.size > single_limit) || (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit); - large = (isa_->GetMajorVersion() < 8) ? false : large; - large = core::Runtime::runtime_singleton_->flag().no_scratch_reclaim() ? false : large; - if (large) - scratch.queue_base = scratch_pool_.alloc_high(scratch.size); - else - scratch.queue_base = scratch_pool_.alloc(scratch.size); - large |= scratch.queue_base > scratch_pool_.high_split(); - scratch.large = large; + if ((isa_->GetMajorVersion() < 8) || + core::Runtime::runtime_singleton_->flag().no_scratch_reclaim()) { + large = false; + use_reclaim = false; + } - scratch.queue_process_offset = - (need_queue_scratch_base) - ? uintptr_t(scratch.queue_base) - : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); + // If large is selected then the scratch will not be retained. + // In that case allocate the minimum necessary for the dispatch since we don't need all slots. + if (large) scratch.size = scratch.dispatch_size; - if (scratch.queue_base != nullptr) { - if (profile_ == HSA_PROFILE_FULL) return; - if (profile_ == HSA_PROFILE_BASE) { - HSAuint64 alternate_va; - if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) == - HSAKMT_STATUS_SUCCESS) { - if (large) scratch_used_large_ += scratch.size; - return; + // Ensure mapping will be in whole pages. + scratch.size = AlignUp(scratch.size, 4096); + + /* + Sequence of attempts is: + check cache + attempt a new allocation + trim unused blocks from cache + attempt a new allocation + check cache for sufficient used block, steal and wait (not implemented) + trim used blocks from cache, evaluate retry + reduce occupancy + */ + + // Lambda called in place. + // Used to allow exit from nested loops. + [&]() { + ScopedAcquire lock(&scratch_lock_); + // Check scratch cache + if (scratch_cache_.alloc(scratch)) return; + + // Attempt new allocation. + for (int i = 0; i < 2; i++) { + if (large) + scratch.queue_base = scratch_pool_.alloc_high(scratch.size); + else + scratch.queue_base = scratch_pool_.alloc(scratch.size); + scratch.large = large | (scratch.queue_base > scratch_pool_.high_split()); + assert(((!scratch.large) | use_reclaim) && "Large scratch used with reclaim disabled."); + + if (scratch.queue_base != nullptr) { + if (profile_ == HSA_PROFILE_FULL) return; + if (profile_ == HSA_PROFILE_BASE) { + HSAuint64 alternate_va; + if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) == + HSAKMT_STATUS_SUCCESS) { + if (scratch.large) scratch_used_large_ += scratch.size; + scratch_cache_.insert(scratch); + return; + } + } } + + // Scratch request failed allocation or mapping. + scratch_pool_.free(scratch.queue_base); + scratch.queue_base = nullptr; + + // Release cached scratch and retry. + // First iteration trims unused blocks, second trims all. + scratch_cache_.trim(i == 1); } - } - // Scratch request failed allocation or mapping. - scratch_pool_.free(scratch.queue_base); - scratch.queue_base = nullptr; - - // Retry if large may yield needed space. - if (scratch_used_large_ != 0) { - if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true; - return; - } - - // Fail scratch allocation if reducing occupancy is disabled. - if (core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) return; - - // Attempt to trim the maximum number of concurrent waves to allow scratch to fit. - if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) - debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n", - scratch.size); - const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU; - const uint64_t total_waves = scratch.size / size_per_wave; - uint64_t waves_per_cu = total_waves / num_cus; - while (waves_per_cu != 0) { - size_t size = waves_per_cu * num_cus * size_per_wave; - void* base = scratch_pool_.alloc(size); - HSAuint64 alternate_va; - if ((base != nullptr) && - ((profile_ == HSA_PROFILE_FULL) || - (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) { - // Scratch allocated and either full profile or map succeeded. - scratch.queue_base = base; - scratch.size = size; - scratch.queue_process_offset = (need_queue_scratch_base) - ? uintptr_t(scratch.queue_base) - : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); - scratch.large = true; - scratch_used_large_ += scratch.size; - if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) - debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size, - float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f); + // Retry if large may yield needed space. + if (scratch_used_large_ != 0) { + if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true; return; } - scratch_pool_.free(base); - waves_per_cu--; - } - // Failed to allocate minimal scratch - assert(scratch.queue_base == nullptr && "bad scratch data"); - if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) - debug_print(" Could not allocate scratch for one wave per CU.\n"); + // Fail scratch allocation if reducing occupancy is disabled. + if ((!use_reclaim) || core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) + return; + + // Attempt to trim the maximum number of concurrent waves to allow scratch to fit. + if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) + debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n", + scratch.size); + const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU; + const uint64_t total_waves = scratch.size / size_per_wave; + uint64_t waves_per_cu = total_waves / num_cus; + while (waves_per_cu != 0) { + size_t size = waves_per_cu * num_cus * size_per_wave; + void* base = scratch_pool_.alloc_high(size); + HSAuint64 alternate_va; + if ((base != nullptr) && + ((profile_ == HSA_PROFILE_FULL) || + (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) { + // Scratch allocated and either full profile or map succeeded. + scratch.queue_base = base; + scratch.size = size; + scratch.large = true; + scratch_used_large_ += scratch.size; + scratch_cache_.insert(scratch); + if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) + debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size, + float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f); + return; + } + scratch_pool_.free(base); + waves_per_cu--; + } + + // Failed to allocate minimal scratch + assert(scratch.queue_base == nullptr && "bad scratch data"); + if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) + debug_print(" Could not allocate scratch for one wave per CU.\n"); + return; + }(); + + scratch.queue_process_offset = need_queue_scratch_base + ? uintptr_t(scratch.queue_base) + : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); } void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) { - if (scratch.queue_base == nullptr) { - return; - } + if (scratch.queue_base == nullptr) return; ScopedAcquire lock(&scratch_lock_); + scratch_cache_.free(scratch); + scratch.queue_base = nullptr; +} + +void GpuAgent::ReleaseScratch(void* base, size_t size, bool large) { if (profile_ == HSA_PROFILE_BASE) { - if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) { + if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(base)) { assert(false && "Unmap scratch subrange failed!"); } } - scratch_pool_.free(scratch.queue_base); - scratch.queue_base = nullptr; + scratch_pool_.free(base); - if (scratch.large) scratch_used_large_ -= scratch.size; + if (large) scratch_used_large_ -= size; // Notify waiters that additional scratch may be available. for (auto notifier : scratch_notifiers_) { @@ -1444,5 +1510,11 @@ lazy_ptr& GpuAgent::GetBlitObject(const core::Agent& dst_agent, return GetXgmiBlit(dst_agent); } +void GpuAgent::Trim() { + Agent::Trim(); + ScopedAcquire lock(&scratch_lock_); + scratch_cache_.trim(false); +} + } // namespace amd } // namespace rocr diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index 43581e0923..fca4398664 100644 --- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -201,7 +201,7 @@ hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, voi // If it fails attempt to release memory from the block allocator and retry. *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size); if (*address == nullptr) { - fragment_allocator_.trim(); + owner()->Trim(); *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size); } @@ -699,6 +699,8 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size, return HSA_STATUS_SUCCESS; } +void MemoryRegion::Trim() const { fragment_allocator_.trim(); } + void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const { assert(request_size <= block_size() && "BlockAllocator alloc request exceeds block size.");