diff --git a/runtime/hsa-runtime/core/inc/agent.h b/runtime/hsa-runtime/core/inc/agent.h
index 08683934cf..c76064ac00 100644
--- a/runtime/hsa-runtime/core/inc/agent.h
+++ b/runtime/hsa-runtime/core/inc/agent.h
@@ -61,6 +61,8 @@ class Signal;
 typedef void (*HsaEventCallback)(hsa_status_t status, hsa_queue_t* source,
                                  void* data);
 
+class MemoryRegion;
+
 // Agent is intended to be an pure interface class and may be wrapped or
 // replaced by tools libraries. All funtions other than Convert, node_id,
 // device_type, and public_handle must be virtual.
@@ -258,10 +260,6 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
     return stat;
   }
 
-  virtual void Trim() {
-    for (auto region : regions()) region->Trim();
-  }
-
  protected:
   // Intention here is to have a polymorphic update procedure for public_handle_
   // which is callable on any Agent* but only from some class dervied from
diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index c00005ff0f..b86f706d75 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -55,7 +55,6 @@
 #include "core/inc/blit.h"
 #include "core/inc/signal.h"
 #include "core/inc/cache.h"
-#include "core/inc/scratch_cache.h"
 #include "core/util/small_heap.h"
 #include "core/util/locks.h"
 #include "core/util/lazy_ptr.h"
@@ -64,7 +63,18 @@ namespace rocr {
 namespace AMD {
 class MemoryRegion;
 
-typedef ScratchCache::ScratchInfo ScratchInfo;
+// @brief Contains scratch memory information.
+struct ScratchInfo {
+  void* queue_base;
+  size_t size;
+  size_t size_per_thread;
+  uint32_t lanes_per_wave;
+  ptrdiff_t queue_process_offset;
+  bool large;
+  bool retry;
+  hsa_signal_t queue_retry;
+  uint64_t wanted_slots;
+};
 
 // @brief Interface to represent a GPU agent.
 class GpuAgentInt : public core::Agent {
@@ -321,8 +331,6 @@ class GpuAgent : public GpuAgentInt {
     return memory_max_frequency_;
   }
 
-  void Trim() override;
-
  protected:
   static const uint32_t minAqlSize_ = 0x1000;   // 4KB min
   static const uint32_t maxAqlSize_ = 0x20000;  // 8MB max
@@ -486,10 +494,6 @@ class GpuAgent : public GpuAgentInt {
   // @brief Deregister scratch notification signals.
   void ClearScratchNotifiers() { scratch_notifiers_.clear(); }
 
-  // @brief Releases scratch back to the driver.
-  // caller must hold scratch_lock_.
-  void ReleaseScratch(void* base, size_t size, bool large);
-
   // Bind index of peer device that is connected via xGMI links
   lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
 
@@ -513,8 +517,6 @@ class GpuAgent : public GpuAgentInt {
     KernelMutex lock_;
   } gws_queue_;
 
-  ScratchCache scratch_cache_;
-
   DISALLOW_COPY_AND_ASSIGN(GpuAgent);
 };
 
diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h
index e119a939eb..f5342610fd 100644
--- a/runtime/hsa-runtime/core/inc/amd_memory_region.h
+++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h
@@ -136,8 +136,6 @@ class MemoryRegion : public core::MemoryRegion {
   hsa_status_t AssignAgent(void* ptr, size_t size, const core::Agent& agent,
                            hsa_access_permission_t access) const;
 
-  void Trim() const;
-
   __forceinline bool IsLocalMemory() const {
     return ((mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) ||
             (mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC));
diff --git a/runtime/hsa-runtime/core/inc/memory_region.h b/runtime/hsa-runtime/core/inc/memory_region.h
index 583b13a124..4940833557 100644
--- a/runtime/hsa-runtime/core/inc/memory_region.h
+++ b/runtime/hsa-runtime/core/inc/memory_region.h
@@ -47,9 +47,8 @@
 
 #include <vector>
 
-#include "core/inc/hsa_internal.h"
+#include "core/inc/agent.h"
 #include "core/inc/checked.h"
-#include "core/util/utils.h"
 
 namespace rocr {
 namespace core {
@@ -107,9 +106,6 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
   virtual hsa_status_t AssignAgent(void* ptr, size_t size, const Agent& agent,
                                    hsa_access_permission_t access) const = 0;
 
-  // Releases any cached memory that may be held within the allocator.
-  virtual void Trim() const {}
-
   __forceinline bool fine_grain() const { return fine_grain_; }
 
   __forceinline bool full_profile() const { return full_profile_; }
diff --git a/runtime/hsa-runtime/core/inc/scratch_cache.h b/runtime/hsa-runtime/core/inc/scratch_cache.h
deleted file mode 100644
index 5029f4781d..0000000000
--- a/runtime/hsa-runtime/core/inc/scratch_cache.h
+++ /dev/null
@@ -1,191 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-//
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-//
-// Copyright (c) 2020-2020, Advanced Micro Devices, Inc. All rights reserved.
-//
-// Developed by:
-//
-//                 AMD Research and AMD HSA Software Development
-//
-//                 Advanced Micro Devices, Inc.
-//
-//                 www.amd.com
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS WITH THE SOFTWARE.
-//
-////////////////////////////////////////////////////////////////////////////////
-
-#ifndef HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_
-#define HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_
-
-#include "core/inc/amd_gpu_agent.h"
-#include "core/util/locks.h"
-#include "core/util/utils.h"
-
-#include <map>
-#include <functional>
-
-namespace rocr {
-namespace AMD {
-
-class ScratchCache {
- public:
-  struct node {
-    enum STATE { FREE = 0, ALLOC = 1, TRIM = 2, STEAL = 4 };
-    void* base;
-    bool large;
-    uint32_t state;
-
-    node() : base(nullptr), state(FREE) {}
-
-    bool isFree() const { return state == FREE; }
-    bool trimPending() const { return state == (ALLOC | TRIM); }
-
-    void trim() {
-      assert(!isFree() && "Trim of free scratch node.");
-      state |= TRIM;
-    }
-    void free() {
-      assert(!isFree() && "Free of free scratch node.");
-      state = FREE;
-    }
-    void alloc() {
-      assert(isFree() && "Alloc of non-free scratch node.");
-      state = ALLOC;
-    }
-  };
-
-  typedef ::std::multimap<size_t, node> map_t;
-  typedef map_t::iterator ref_t;
-  typedef ::std::function<void(void*, size_t, bool)> deallocator_t;
-
-  // @brief Contains scratch memory information.
-  struct ScratchInfo {
-    void* queue_base;
-    // Size to fill the machine with size_per_thread
-    size_t size;
-    // Size to satisfy the present dispatch without throttling.
-    size_t dispatch_size;
-    size_t size_per_thread;
-    uint32_t lanes_per_wave;
-    ptrdiff_t queue_process_offset;
-    bool large;
-    bool retry;
-    hsa_signal_t queue_retry;
-    uint64_t wanted_slots;
-    ScratchCache::ref_t scratch_node;
-  };
-
-  ScratchCache(const ScratchCache& rhs) = delete;
-  ScratchCache(ScratchCache&& rhs) = delete;
-  ScratchCache& operator=(const ScratchCache& rhs) = delete;
-  ScratchCache& operator=(ScratchCache&& rhs) = delete;
-
-  ScratchCache(deallocator_t deallocator) : dealloc(deallocator) {}
-
-  ~ScratchCache() { assert(map.empty() && "ScratchCache not empty at shutdown."); }
-
-  bool alloc(ScratchInfo& info) {
-    ref_t it = map.upper_bound(info.size - 1);
-    if (it == map.end()) return false;
-
-    // Small requests must have an exact size match and be small.
-    if (!info.large) {
-      while ((it != map.end()) && (it->first == info.size)) {
-        if (it->second.isFree() && (!it->second.large)) {
-          it->second.alloc();
-          info.queue_base = it->second.base;
-          info.scratch_node = it;
-          return true;
-        }
-        it++;
-      }
-      return false;
-    }
-
-    // Large requests may use a small allocation and do not require an exact size match.
-    while (it != map.end()) {
-      if (it->second.isFree()) {
-        it->second.alloc();
-        info.queue_base = it->second.base;
-        info.size = it->first;
-        info.scratch_node = it;
-        return true;
-      }
-      it++;
-    }
-    return false;
-  }
-
-  void free(ScratchInfo& info) {
-    assert(!info.scratch_node->second.isFree() && "free called on free scratch node.");
-    auto it = info.scratch_node;
-    if (it->second.trimPending()) {
-      dealloc(it->second.base, it->first, it->second.large);
-      map.erase(it);
-      return;
-    }
-    it->second.free();
-  }
-
-  bool trim(bool trim_nodes_in_use) {
-    bool ret = !map.empty();
-    auto it = map.begin();
-    while (it != map.end()) {
-      if (it->second.isFree()) {
-        dealloc(it->second.base, it->first, it->second.large);
-        auto temp = it;
-        it++;
-        map.erase(temp);
-      } else {
-        if (trim_nodes_in_use) it->second.trim();
-        it++;
-      }
-    }
-    return ret;
-  }
-
-  void insert(ScratchInfo& info) {
-    node n;
-    n.base = info.queue_base;
-    n.large = info.large;
-    n.alloc();
-
-    auto it = map.insert(std::make_pair(info.size, n));
-    info.scratch_node = it;
-  }
-
- private:
-  map_t map;
-  deallocator_t dealloc;
-};
-
-}  // namespace AMD
-}  // namespace rocr
-
-#endif  // header guard
diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index 03d5c9285f..3a4352fc70 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -792,25 +792,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
       // Align whole waves to 1KB.
       scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
       scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
-
-      uint64_t lanes_per_group =
-          (uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
-          pkt.dispatch.workgroup_size_z;
-      uint64_t waves_per_group =
-          (lanes_per_group + scratch.lanes_per_wave - 1) / scratch.lanes_per_wave;
-      uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) /
-                         pkt.dispatch.workgroup_size_x) *
-          ((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) /
-           pkt.dispatch.workgroup_size_y) *
-          ((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
-           pkt.dispatch.workgroup_size_z);
-
-      scratch.wanted_slots = groups * waves_per_group;
+#ifndef NDEBUG
+      scratch.wanted_slots = ((uint64_t(pkt.dispatch.grid_size_x) * pkt.dispatch.grid_size_y) *
+                              pkt.dispatch.grid_size_z) / scratch.lanes_per_wave;
       scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
-      scratch.wanted_slots =
-          Max(scratch.wanted_slots, uint64_t(queue->agent_->properties().NumShaderBanks));
-      scratch.dispatch_size =
-          scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave;
+#endif
 
       queue->agent_->AcquireQueueScratch(scratch);
 
@@ -1131,7 +1117,7 @@ void AqlQueue::InitScratchSRD() {
                                queue_scratch_.size_per_thread) + 1023) / 1024);
   tmpring_size.bits.WAVESIZE = wave_scratch;
   assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
-  uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024);
+  uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
   tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
   amd_queue_.compute_tmpring_size = tmpring_size.u32All;
   return;
diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index b4163a66e8..49f44e59b6 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -91,9 +91,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
       memory_bus_width_(0),
       memory_max_frequency_(0),
       ape1_base_(0),
-      ape1_size_(0),
-      scratch_cache_(
-          [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) {
+      ape1_size_(0) {
   const bool is_apu_node = (properties_.NumCPUCores > 0);
   profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
 
@@ -173,7 +171,6 @@ GpuAgent::~GpuAgent() {
     _aligned_free(reinterpret_cast<void*>(ape1_base_));
   }
 
-  scratch_cache_.trim(true);
   if (scratch_pool_.base() != NULL) {
     hsaKmtFreeMemory(scratch_pool_.base(), scratch_pool_.size());
   }
@@ -1010,8 +1007,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
   scratch.size_per_thread = private_segment_size;
 
   const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
-  scratch.size =
-      scratch.size_per_thread * properties_.MaxSlotsScratchCU * scratch.lanes_per_wave * num_cu;
+  scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu;
   scratch.queue_base = nullptr;
   scratch.queue_process_offset = 0;
 
@@ -1057,166 +1053,104 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
   uint64_t size_per_wave = AlignUp(scratch.size_per_thread * properties_.WaveFrontSize, 1024);
   if (size_per_wave > MAX_WAVE_SCRATCH) return;
 
-  /*
-  Determine size class needed.
-
-  Scratch allocations come in two flavors based on how it is retired.  Small allocations may be
-  kept bound to a queue and reused by firmware.  This memory can not be reclaimed by the runtime
-  on demand so must be kept small to avoid egregious OOM conditions.  Other allocations, aka large,
-  may be used by firmware only for one dispatch and are then surrendered to the runtime.  This has
-  significant latency so we don't want to make all scratch allocations large (ie single use).
-
-  Note that the designation "large" is for contrast with "small", which must really be small
-  amounts of memory, and does not always imply a large quantity of memory is needed.  Other
-  properties of the allocation may require single use and so qualify the allocation or use as
-  "large".
-
-  Here we decide on the boundaries for small scratch allocations.  Both the largest small single
-  allocation and the maximum amount of memory bound by small allocations are limited.  Additionally
-  some legacy devices do not support large scratch.
-
-  For small scratch we must allocate enough memory for every physical scratch slot.
-  For large scratch compute the minimum memory needed to run the dispatch without limiting
-  occupancy.
-  Limit total bound small scratch allocations to 1/8th of scratch pool and 1/4 of that for a single
-  allocation.
-  */
+  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue.
   size_t small_limit = scratch_pool_.size() >> 3;
   // Lift limit for 2.10 release RCCL workaround.
   size_t single_limit = 146800640; //small_limit >> 2;
-  bool use_reclaim = true;
   bool large = (scratch.size > single_limit) ||
       (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit);
-  if ((isa_->GetMajorVersion() < 8) ||
-      core::Runtime::runtime_singleton_->flag().no_scratch_reclaim()) {
-    large = false;
-    use_reclaim = false;
-  }
+  large = (isa_->GetMajorVersion() < 8) ? false : large;
+  large = core::Runtime::runtime_singleton_->flag().no_scratch_reclaim() ? false : large;
+  if (large)
+    scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
+  else
+    scratch.queue_base = scratch_pool_.alloc(scratch.size);
+  large |= scratch.queue_base > scratch_pool_.high_split();
+  scratch.large = large;
 
-  // If large is selected then the scratch will not be retained.
-  // In that case allocate the minimum necessary for the dispatch since we don't need all slots.
-  if (large) scratch.size = scratch.dispatch_size;
+  scratch.queue_process_offset =
+      (need_queue_scratch_base)
+          ? uintptr_t(scratch.queue_base)
+          : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
 
-  // Ensure mapping will be in whole pages.
-  scratch.size = AlignUp(scratch.size, 4096);
-
-  /*
-  Sequence of attempts is:
-    check cache
-    attempt a new allocation
-    trim unused blocks from cache
-    attempt a new allocation
-    check cache for sufficient used block, steal and wait (not implemented)
-    trim used blocks from cache, evaluate retry
-    reduce occupancy
-  */
-
-  // Lambda called in place.
-  // Used to allow exit from nested loops.
-  [&]() {
-    ScopedAcquire<KernelMutex> lock(&scratch_lock_);
-    // Check scratch cache
-    if (scratch_cache_.alloc(scratch)) return;
-
-    // Attempt new allocation.
-    for (int i = 0; i < 2; i++) {
-      if (large)
-        scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
-      else
-        scratch.queue_base = scratch_pool_.alloc(scratch.size);
-      scratch.large = large | (scratch.queue_base > scratch_pool_.high_split());
-      assert(((!scratch.large) | use_reclaim) && "Large scratch used with reclaim disabled.");
-
-      if (scratch.queue_base != nullptr) {
-        if (profile_ == HSA_PROFILE_FULL) return;
-        if (profile_ == HSA_PROFILE_BASE) {
-          HSAuint64 alternate_va;
-          if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
-              HSAKMT_STATUS_SUCCESS) {
-            if (scratch.large) scratch_used_large_ += scratch.size;
-            scratch_cache_.insert(scratch);
-            return;
-          }
-        }
-      }
-
-      // Scratch request failed allocation or mapping.
-      scratch_pool_.free(scratch.queue_base);
-      scratch.queue_base = nullptr;
-
-      // Release cached scratch and retry.
-      // First iteration trims unused blocks, second trims all.
-      scratch_cache_.trim(i == 1);
-    }
-
-    // Retry if large may yield needed space.
-    if (scratch_used_large_ != 0) {
-      if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
-      return;
-    }
-
-    // Fail scratch allocation if reducing occupancy is disabled.
-    if ((!use_reclaim) || core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter())
-      return;
-
-    // Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
-    if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
-      debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
-                  scratch.size);
-    const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
-    const uint64_t total_waves = scratch.size / size_per_wave;
-    uint64_t waves_per_cu = total_waves / num_cus;
-    while (waves_per_cu != 0) {
-      size_t size = waves_per_cu * num_cus * size_per_wave;
-      void* base = scratch_pool_.alloc_high(size);
+  if (scratch.queue_base != nullptr) {
+    if (profile_ == HSA_PROFILE_FULL) return;
+    if (profile_ == HSA_PROFILE_BASE) {
       HSAuint64 alternate_va;
-      if ((base != nullptr) &&
-          ((profile_ == HSA_PROFILE_FULL) ||
-           (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
-        // Scratch allocated and either full profile or map succeeded.
-        scratch.queue_base = base;
-        scratch.size = size;
-        scratch.large = true;
-        scratch_used_large_ += scratch.size;
-        scratch_cache_.insert(scratch);
-        if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
-          debug_print("  %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
-                      float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
+      if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
+          HSAKMT_STATUS_SUCCESS) {
+        if (large) scratch_used_large_ += scratch.size;
         return;
       }
-      scratch_pool_.free(base);
-      waves_per_cu--;
     }
+  }
 
-    // Failed to allocate minimal scratch
-    assert(scratch.queue_base == nullptr && "bad scratch data");
-    if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
-      debug_print("  Could not allocate scratch for one wave per CU.\n");
+  // Scratch request failed allocation or mapping.
+  scratch_pool_.free(scratch.queue_base);
+  scratch.queue_base = nullptr;
+
+  // Retry if large may yield needed space.
+  if (scratch_used_large_ != 0) {
+    if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
     return;
-  }();
+  }
 
-  scratch.queue_process_offset = need_queue_scratch_base
-      ? uintptr_t(scratch.queue_base)
-      : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
+  // Fail scratch allocation if reducing occupancy is disabled.
+  if (core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) return;
+
+  // Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
+  if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
+    debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
+                scratch.size);
+  const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
+  const uint64_t total_waves = scratch.size / size_per_wave;
+  uint64_t waves_per_cu = total_waves / num_cus;
+  while (waves_per_cu != 0) {
+    size_t size = waves_per_cu * num_cus * size_per_wave;
+    void* base = scratch_pool_.alloc(size);
+    HSAuint64 alternate_va;
+    if ((base != nullptr) &&
+        ((profile_ == HSA_PROFILE_FULL) ||
+         (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
+      // Scratch allocated and either full profile or map succeeded.
+      scratch.queue_base = base;
+      scratch.size = size;
+      scratch.queue_process_offset = (need_queue_scratch_base)
+          ? uintptr_t(scratch.queue_base)
+          : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
+      scratch.large = true;
+      scratch_used_large_ += scratch.size;
+      if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
+        debug_print("  %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
+                    float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
+      return;
+    }
+    scratch_pool_.free(base);
+    waves_per_cu--;
+  }
+
+  // Failed to allocate minimal scratch
+  assert(scratch.queue_base == nullptr && "bad scratch data");
+  if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
+    debug_print("  Could not allocate scratch for one wave per CU.\n");
 }
 
 void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
-  if (scratch.queue_base == nullptr) return;
+  if (scratch.queue_base == nullptr) {
+    return;
+  }
 
   ScopedAcquire<KernelMutex> lock(&scratch_lock_);
-  scratch_cache_.free(scratch);
-  scratch.queue_base = nullptr;
-}
-
-void GpuAgent::ReleaseScratch(void* base, size_t size, bool large) {
   if (profile_ == HSA_PROFILE_BASE) {
-    if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(base)) {
+    if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) {
       assert(false && "Unmap scratch subrange failed!");
     }
   }
-  scratch_pool_.free(base);
+  scratch_pool_.free(scratch.queue_base);
+  scratch.queue_base = nullptr;
 
-  if (large) scratch_used_large_ -= size;
+  if (scratch.large) scratch_used_large_ -= scratch.size;
 
   // Notify waiters that additional scratch may be available.
   for (auto notifier : scratch_notifiers_) {
@@ -1510,11 +1444,5 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
   return GetXgmiBlit(dst_agent);
 }
 
-void GpuAgent::Trim() {
-  Agent::Trim();
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
-  scratch_cache_.trim(false);
-}
-
 }  // namespace amd
 }  // namespace rocr
diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
index fca4398664..43581e0923 100644
--- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
@@ -201,7 +201,7 @@ hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, voi
   // If it fails attempt to release memory from the block allocator and retry.
   *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
   if (*address == nullptr) {
-    owner()->Trim();
+    fragment_allocator_.trim();
     *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
   }
 
@@ -699,8 +699,6 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size,
   return HSA_STATUS_SUCCESS;
 }
 
-void MemoryRegion::Trim() const { fragment_allocator_.trim(); }
-
 void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
   assert(request_size <= block_size() && "BlockAllocator alloc request exceeds block size.");