Cache scratch allocations.

Avoids calling to KFD to map/unmap scratch allocations for every large scratch using dispatch. Change-Id: I9fab5705251ec82b03e4f2f2ca6da7cdccabefb9
2020-10-07 06:41:19 -05:00
@@ -61,8 +61,6 @@ class Signal;
 typedef void (*HsaEventCallback)(hsa_status_t status, hsa_queue_t* source,
                                 void* data);

-class MemoryRegion;
-
 // Agent is intended to be an pure interface class and may be wrapped or
 // replaced by tools libraries. All funtions other than Convert, node_id,
 // device_type, and public_handle must be virtual.
@@ -260,6 +258,10 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
    return stat;
  }

+  virtual void Trim() {
+    for (auto region : regions()) region->Trim();
+  }
+
 protected:
  // Intention here is to have a polymorphic update procedure for public_handle_
  // which is callable on any Agent* but only from some class dervied from
@@ -55,6 +55,7 @@
 #include "core/inc/blit.h"
 #include "core/inc/signal.h"
 #include "core/inc/cache.h"
+#include "core/inc/scratch_cache.h"
 #include "core/util/small_heap.h"
 #include "core/util/locks.h"
 #include "core/util/lazy_ptr.h"
@@ -63,18 +64,7 @@ namespace rocr {
 namespace AMD {
 class MemoryRegion;

-// @brief Contains scratch memory information.
-struct ScratchInfo {
-  void* queue_base;
-  size_t size;
-  size_t size_per_thread;
-  uint32_t lanes_per_wave;
-  ptrdiff_t queue_process_offset;
-  bool large;
-  bool retry;
-  hsa_signal_t queue_retry;
-  uint64_t wanted_slots;
-};
+typedef ScratchCache::ScratchInfo ScratchInfo;

 // @brief Interface to represent a GPU agent.
 class GpuAgentInt : public core::Agent {
@@ -331,6 +321,8 @@ class GpuAgent : public GpuAgentInt {
    return memory_max_frequency_;
  }

+  void Trim() override;
+
 protected:
  static const uint32_t minAqlSize_ = 0x1000;   // 4KB min
  static const uint32_t maxAqlSize_ = 0x20000;  // 8MB max
@@ -494,6 +486,10 @@ class GpuAgent : public GpuAgentInt {
  // @brief Deregister scratch notification signals.
  void ClearScratchNotifiers() { scratch_notifiers_.clear(); }

+  // @brief Releases scratch back to the driver.
+  // caller must hold scratch_lock_.
+  void ReleaseScratch(void* base, size_t size, bool large);
+
  // Bind index of peer device that is connected via xGMI links
  lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);

@@ -517,6 +513,8 @@ class GpuAgent : public GpuAgentInt {
    KernelMutex lock_;
  } gws_queue_;

+  ScratchCache scratch_cache_;
+
  DISALLOW_COPY_AND_ASSIGN(GpuAgent);
 };

@@ -136,6 +136,8 @@ class MemoryRegion : public core::MemoryRegion {
  hsa_status_t AssignAgent(void* ptr, size_t size, const core::Agent& agent,
                           hsa_access_permission_t access) const;

+  void Trim() const;
+
  __forceinline bool IsLocalMemory() const {
    return ((mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) ||
            (mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC));
@@ -47,8 +47,9 @@

 #include <vector>

-#include "core/inc/agent.h"
+#include "core/inc/hsa_internal.h"
 #include "core/inc/checked.h"
+#include "core/util/utils.h"

 namespace rocr {
 namespace core {
@@ -106,6 +107,9 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
  virtual hsa_status_t AssignAgent(void* ptr, size_t size, const Agent& agent,
                                   hsa_access_permission_t access) const = 0;

+  // Releases any cached memory that may be held within the allocator.
+  virtual void Trim() const {}
+
  __forceinline bool fine_grain() const { return fine_grain_; }

  __forceinline bool full_profile() const { return full_profile_; }
@@ -0,0 +1,191 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_
+#define HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_
+
+#include "core/inc/amd_gpu_agent.h"
+#include "core/util/locks.h"
+#include "core/util/utils.h"
+
+#include <map>
+#include <functional>
+
+namespace rocr {
+namespace AMD {
+
+class ScratchCache {
+ public:
+  struct node {
+    enum STATE { FREE = 0, ALLOC = 1, TRIM = 2, STEAL = 4 };
+    void* base;
+    bool large;
+    uint32_t state;
+
+    node() : base(nullptr), state(FREE) {}
+
+    bool isFree() const { return state == FREE; }
+    bool trimPending() const { return state == (ALLOC | TRIM); }
+
+    void trim() {
+      assert(!isFree() && "Trim of free scratch node.");
+      state |= TRIM;
+    }
+    void free() {
+      assert(!isFree() && "Free of free scratch node.");
+      state = FREE;
+    }
+    void alloc() {
+      assert(isFree() && "Alloc of non-free scratch node.");
+      state = ALLOC;
+    }
+  };
+
+  typedef ::std::multimap<size_t, node> map_t;
+  typedef map_t::iterator ref_t;
+  typedef ::std::function<void(void*, size_t, bool)> deallocator_t;
+
+  // @brief Contains scratch memory information.
+  struct ScratchInfo {
+    void* queue_base;
+    // Size to fill the machine with size_per_thread
+    size_t size;
+    // Size to satisfy the present dispatch without throttling.
+    size_t dispatch_size;
+    size_t size_per_thread;
+    uint32_t lanes_per_wave;
+    ptrdiff_t queue_process_offset;
+    bool large;
+    bool retry;
+    hsa_signal_t queue_retry;
+    uint64_t wanted_slots;
+    ScratchCache::ref_t scratch_node;
+  };
+
+  ScratchCache(const ScratchCache& rhs) = delete;
+  ScratchCache(ScratchCache&& rhs) = delete;
+  ScratchCache& operator=(const ScratchCache& rhs) = delete;
+  ScratchCache& operator=(ScratchCache&& rhs) = delete;
+
+  ScratchCache(deallocator_t deallocator) : dealloc(deallocator) {}
+
+  ~ScratchCache() { assert(map.empty() && "ScratchCache not empty at shutdown."); }
+
+  bool alloc(ScratchInfo& info) {
+    ref_t it = map.upper_bound(info.size - 1);
+    if (it == map.end()) return false;
+
+    // Small requests must have an exact size match and be small.
+    if (!info.large) {
+      while ((it != map.end()) && (it->first == info.size)) {
+        if (it->second.isFree() && (!it->second.large)) {
+          it->second.alloc();
+          info.queue_base = it->second.base;
+          info.scratch_node = it;
+          return true;
+        }
+        it++;
+      }
+      return false;
+    }
+
+    // Large requests may use a small allocation and do not require an exact size match.
+    while (it != map.end()) {
+      if (it->second.isFree()) {
+        it->second.alloc();
+        info.queue_base = it->second.base;
+        info.size = it->first;
+        info.scratch_node = it;
+        return true;
+      }
+      it++;
+    }
+    return false;
+  }
+
+  void free(ScratchInfo& info) {
+    assert(!info.scratch_node->second.isFree() && "free called on free scratch node.");
+    auto it = info.scratch_node;
+    if (it->second.trimPending()) {
+      dealloc(it->second.base, it->first, it->second.large);
+      map.erase(it);
+      return;
+    }
+    it->second.free();
+  }
+
+  bool trim(bool trim_nodes_in_use) {
+    bool ret = !map.empty();
+    auto it = map.begin();
+    while (it != map.end()) {
+      if (it->second.isFree()) {
+        dealloc(it->second.base, it->first, it->second.large);
+        auto temp = it;
+        it++;
+        map.erase(temp);
+      } else {
+        if (trim_nodes_in_use) it->second.trim();
+        it++;
+      }
+    }
+    return ret;
+  }
+
+  void insert(ScratchInfo& info) {
+    node n;
+    n.base = info.queue_base;
+    n.large = info.large;
+    n.alloc();
+
+    auto it = map.insert(std::make_pair(info.size, n));
+    info.scratch_node = it;
+  }
+
+ private:
+  map_t map;
+  deallocator_t dealloc;
+};
+
+}  // namespace AMD
+}  // namespace rocr
+
+#endif  // header guard
@@ -792,11 +792,25 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
      // Align whole waves to 1KB.
      scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
      scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
-#ifndef NDEBUG
-      scratch.wanted_slots = ((uint64_t(pkt.dispatch.grid_size_x) * pkt.dispatch.grid_size_y) *
-                              pkt.dispatch.grid_size_z) / scratch.lanes_per_wave;
+
+      uint64_t lanes_per_group =
+          (uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
+          pkt.dispatch.workgroup_size_z;
+      uint64_t waves_per_group =
+          (lanes_per_group + scratch.lanes_per_wave - 1) / scratch.lanes_per_wave;
+      uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) /
+                         pkt.dispatch.workgroup_size_x) *
+          ((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) /
+           pkt.dispatch.workgroup_size_y) *
+          ((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
+           pkt.dispatch.workgroup_size_z);
+
+      scratch.wanted_slots = groups * waves_per_group;
      scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
-#endif
+      scratch.wanted_slots =
+          Max(scratch.wanted_slots, uint64_t(queue->agent_->properties().NumShaderBanks));
+      scratch.dispatch_size =
+          scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave;

      queue->agent_->AcquireQueueScratch(scratch);

@@ -1117,7 +1131,7 @@ void AqlQueue::InitScratchSRD() {
                               queue_scratch_.size_per_thread) + 1023) / 1024);
  tmpring_size.bits.WAVESIZE = wave_scratch;
  assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
-  uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
+  uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024);
  tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
  amd_queue_.compute_tmpring_size = tmpring_size.u32All;
  return;
@@ -91,7 +91,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
      memory_bus_width_(0),
      memory_max_frequency_(0),
      ape1_base_(0),
-      ape1_size_(0) {
+      ape1_size_(0),
+      scratch_cache_(
+          [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) {
  const bool is_apu_node = (properties_.NumCPUCores > 0);
  profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;

@@ -171,6 +173,7 @@ GpuAgent::~GpuAgent() {
    _aligned_free(reinterpret_cast<void*>(ape1_base_));
  }

+  scratch_cache_.trim(true);
  if (scratch_pool_.base() != NULL) {
    hsaKmtFreeMemory(scratch_pool_.base(), scratch_pool_.size());
  }
@@ -1007,7 +1010,8 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
  scratch.size_per_thread = private_segment_size;

  const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
-  scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu;
+  scratch.size =
+      scratch.size_per_thread * properties_.MaxSlotsScratchCU * scratch.lanes_per_wave * num_cu;
  scratch.queue_base = nullptr;
  scratch.queue_process_offset = 0;

@@ -1053,104 +1057,166 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
  uint64_t size_per_wave = AlignUp(scratch.size_per_thread * properties_.WaveFrontSize, 1024);
  if (size_per_wave > MAX_WAVE_SCRATCH) return;

-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
-  // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue.
+  /*
+  Determine size class needed.
+
+  Scratch allocations come in two flavors based on how it is retired.  Small allocations may be
+  kept bound to a queue and reused by firmware.  This memory can not be reclaimed by the runtime
+  on demand so must be kept small to avoid egregious OOM conditions.  Other allocations, aka large,
+  may be used by firmware only for one dispatch and are then surrendered to the runtime.  This has
+  significant latency so we don't want to make all scratch allocations large (ie single use).
+
+  Note that the designation "large" is for contrast with "small", which must really be small
+  amounts of memory, and does not always imply a large quantity of memory is needed.  Other
+  properties of the allocation may require single use and so qualify the allocation or use as
+  "large".
+
+  Here we decide on the boundaries for small scratch allocations.  Both the largest small single
+  allocation and the maximum amount of memory bound by small allocations are limited.  Additionally
+  some legacy devices do not support large scratch.
+
+  For small scratch we must allocate enough memory for every physical scratch slot.
+  For large scratch compute the minimum memory needed to run the dispatch without limiting
+  occupancy.
+  Limit total bound small scratch allocations to 1/8th of scratch pool and 1/4 of that for a single
+  allocation.
+  */
  size_t small_limit = scratch_pool_.size() >> 3;
  // Lift limit for 2.10 release RCCL workaround.
  size_t single_limit = 146800640; //small_limit >> 2;
+  bool use_reclaim = true;
  bool large = (scratch.size > single_limit) ||
      (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit);
-  large = (isa_->GetMajorVersion() < 8) ? false : large;
-  large = core::Runtime::runtime_singleton_->flag().no_scratch_reclaim() ? false : large;
-  if (large)
-    scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
-  else
-    scratch.queue_base = scratch_pool_.alloc(scratch.size);
-  large |= scratch.queue_base > scratch_pool_.high_split();
-  scratch.large = large;
+  if ((isa_->GetMajorVersion() < 8) ||
+      core::Runtime::runtime_singleton_->flag().no_scratch_reclaim()) {
+    large = false;
+    use_reclaim = false;
+  }

-  scratch.queue_process_offset =
-      (need_queue_scratch_base)
-          ? uintptr_t(scratch.queue_base)
-          : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
+  // If large is selected then the scratch will not be retained.
+  // In that case allocate the minimum necessary for the dispatch since we don't need all slots.
+  if (large) scratch.size = scratch.dispatch_size;

-  if (scratch.queue_base != nullptr) {
-    if (profile_ == HSA_PROFILE_FULL) return;
-    if (profile_ == HSA_PROFILE_BASE) {
-      HSAuint64 alternate_va;
-      if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
-          HSAKMT_STATUS_SUCCESS) {
-        if (large) scratch_used_large_ += scratch.size;
-        return;
+  // Ensure mapping will be in whole pages.
+  scratch.size = AlignUp(scratch.size, 4096);
+
+  /*
+  Sequence of attempts is:
+    check cache
+    attempt a new allocation
+    trim unused blocks from cache
+    attempt a new allocation
+    check cache for sufficient used block, steal and wait (not implemented)
+    trim used blocks from cache, evaluate retry
+    reduce occupancy
+  */
+
+  // Lambda called in place.
+  // Used to allow exit from nested loops.
+  [&]() {
+    ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+    // Check scratch cache
+    if (scratch_cache_.alloc(scratch)) return;
+
+    // Attempt new allocation.
+    for (int i = 0; i < 2; i++) {
+      if (large)
+        scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
+      else
+        scratch.queue_base = scratch_pool_.alloc(scratch.size);
+      scratch.large = large | (scratch.queue_base > scratch_pool_.high_split());
+      assert(((!scratch.large) | use_reclaim) && "Large scratch used with reclaim disabled.");
+
+      if (scratch.queue_base != nullptr) {
+        if (profile_ == HSA_PROFILE_FULL) return;
+        if (profile_ == HSA_PROFILE_BASE) {
+          HSAuint64 alternate_va;
+          if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
+              HSAKMT_STATUS_SUCCESS) {
+            if (scratch.large) scratch_used_large_ += scratch.size;
+            scratch_cache_.insert(scratch);
+            return;
+          }
+        }
      }
+
+      // Scratch request failed allocation or mapping.
+      scratch_pool_.free(scratch.queue_base);
+      scratch.queue_base = nullptr;
+
+      // Release cached scratch and retry.
+      // First iteration trims unused blocks, second trims all.
+      scratch_cache_.trim(i == 1);
    }
-  }

-  // Scratch request failed allocation or mapping.
-  scratch_pool_.free(scratch.queue_base);
-  scratch.queue_base = nullptr;
-
-  // Retry if large may yield needed space.
-  if (scratch_used_large_ != 0) {
-    if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
-    return;
-  }
-
-  // Fail scratch allocation if reducing occupancy is disabled.
-  if (core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) return;
-
-  // Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
-  if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
-    debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
-                scratch.size);
-  const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
-  const uint64_t total_waves = scratch.size / size_per_wave;
-  uint64_t waves_per_cu = total_waves / num_cus;
-  while (waves_per_cu != 0) {
-    size_t size = waves_per_cu * num_cus * size_per_wave;
-    void* base = scratch_pool_.alloc(size);
-    HSAuint64 alternate_va;
-    if ((base != nullptr) &&
-        ((profile_ == HSA_PROFILE_FULL) ||
-         (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
-      // Scratch allocated and either full profile or map succeeded.
-      scratch.queue_base = base;
-      scratch.size = size;
-      scratch.queue_process_offset = (need_queue_scratch_base)
-          ? uintptr_t(scratch.queue_base)
-          : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
-      scratch.large = true;
-      scratch_used_large_ += scratch.size;
-      if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
-        debug_print("  %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
-                    float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
+    // Retry if large may yield needed space.
+    if (scratch_used_large_ != 0) {
+      if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
      return;
    }
-    scratch_pool_.free(base);
-    waves_per_cu--;
-  }

-  // Failed to allocate minimal scratch
-  assert(scratch.queue_base == nullptr && "bad scratch data");
-  if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
-    debug_print("  Could not allocate scratch for one wave per CU.\n");
+    // Fail scratch allocation if reducing occupancy is disabled.
+    if ((!use_reclaim) || core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter())
+      return;
+
+    // Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
+    if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
+      debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
+                  scratch.size);
+    const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
+    const uint64_t total_waves = scratch.size / size_per_wave;
+    uint64_t waves_per_cu = total_waves / num_cus;
+    while (waves_per_cu != 0) {
+      size_t size = waves_per_cu * num_cus * size_per_wave;
+      void* base = scratch_pool_.alloc_high(size);
+      HSAuint64 alternate_va;
+      if ((base != nullptr) &&
+          ((profile_ == HSA_PROFILE_FULL) ||
+           (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
+        // Scratch allocated and either full profile or map succeeded.
+        scratch.queue_base = base;
+        scratch.size = size;
+        scratch.large = true;
+        scratch_used_large_ += scratch.size;
+        scratch_cache_.insert(scratch);
+        if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
+          debug_print("  %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
+                      float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
+        return;
+      }
+      scratch_pool_.free(base);
+      waves_per_cu--;
+    }
+
+    // Failed to allocate minimal scratch
+    assert(scratch.queue_base == nullptr && "bad scratch data");
+    if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
+      debug_print("  Could not allocate scratch for one wave per CU.\n");
+    return;
+  }();
+
+  scratch.queue_process_offset = need_queue_scratch_base
+      ? uintptr_t(scratch.queue_base)
+      : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
 }

 void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
-  if (scratch.queue_base == nullptr) {
-    return;
-  }
+  if (scratch.queue_base == nullptr) return;

  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  scratch_cache_.free(scratch);
+  scratch.queue_base = nullptr;
+}
+
+void GpuAgent::ReleaseScratch(void* base, size_t size, bool large) {
  if (profile_ == HSA_PROFILE_BASE) {
-    if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) {
+    if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(base)) {
      assert(false && "Unmap scratch subrange failed!");
    }
  }
-  scratch_pool_.free(scratch.queue_base);
-  scratch.queue_base = nullptr;
+  scratch_pool_.free(base);

-  if (scratch.large) scratch_used_large_ -= scratch.size;
+  if (large) scratch_used_large_ -= size;

  // Notify waiters that additional scratch may be available.
  for (auto notifier : scratch_notifiers_) {
@@ -1444,5 +1510,11 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
  return GetXgmiBlit(dst_agent);
 }

+void GpuAgent::Trim() {
+  Agent::Trim();
+  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  scratch_cache_.trim(false);
+}
+
 }  // namespace amd
 }  // namespace rocr
@@ -201,7 +201,7 @@ hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, voi
  // If it fails attempt to release memory from the block allocator and retry.
  *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
  if (*address == nullptr) {
-    fragment_allocator_.trim();
+    owner()->Trim();
    *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
  }

@@ -699,6 +699,8 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size,
  return HSA_STATUS_SUCCESS;
 }

+void MemoryRegion::Trim() const { fragment_allocator_.trim(); }
+
 void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
  assert(request_size <= block_size() && "BlockAllocator alloc request exceeds block size.");