From 637b0d71f0ea7da409d7126b5828cc1982f02d92 Mon Sep 17 00:00:00 2001
From: pghoshamd <Prerona.Ghosh@amd.com>
Date: Tue, 6 Jan 2026 10:59:34 -0500
Subject: [PATCH] SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers
 (#2146)

* SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers

* Replace KernelMutex and KernelSharedMutex abstractions with std::mutex and std::shared_mutex

* Replaced unique_locks with lock_guards

* More changes

* Replace new and deletes with smart pointers

* Replaced some more raw pointers with shared_ptrs

* Replacements with smart pointers - pt 2

* missed change
---
 .../runtime/hsa-runtime/core/inc/agent.h      |   7 +-
 .../hsa-runtime/core/inc/amd_aie_agent.h      |   4 +-
 .../hsa-runtime/core/inc/amd_aql_queue.h      |  10 +-
 .../hsa-runtime/core/inc/amd_blit_sdma.h      |   2 +-
 .../hsa-runtime/core/inc/amd_cpu_agent.h      |   6 +-
 .../hsa-runtime/core/inc/amd_gpu_agent.h      |  22 +--
 .../hsa-runtime/core/inc/amd_hsa_code.hpp     |  10 +-
 .../hsa-runtime/core/inc/amd_hsa_loader.hpp   |   2 +-
 .../hsa-runtime/core/inc/amd_memory_region.h  |   2 +-
 .../hsa-runtime/core/inc/intercept_queue.h    |   2 +-
 .../runtime/hsa-runtime/core/inc/ipc_signal.h |   2 +-
 .../runtime/hsa-runtime/core/inc/runtime.h    |  40 +++--
 .../runtime/hsa-runtime/core/inc/signal.h     |   3 +-
 .../core/runtime/amd_aie_agent.cpp            |  22 +--
 .../core/runtime/amd_aql_queue.cpp            |  22 +--
 .../core/runtime/amd_blit_sdma.cpp            |   4 +-
 .../core/runtime/amd_cpu_agent.cpp            |  15 +-
 .../core/runtime/amd_gpu_agent.cpp            |  83 +++++-----
 .../core/runtime/amd_loader_context.cpp       |  10 +-
 .../core/runtime/amd_memory_region.cpp        |  13 +-
 .../runtime/hsa-runtime/core/runtime/hsa.cpp  |  10 +-
 .../hsa-runtime/core/runtime/hsa_ext_amd.cpp  |   4 +-
 .../core/runtime/intercept_queue.cpp          |   2 +-
 .../core/runtime/interrupt_signal.cpp         |   4 +-
 .../hsa-runtime/core/runtime/ipc_signal.cpp   |   4 +-
 .../hsa-runtime/core/runtime/runtime.cpp      | 119 +++++++-------
 .../hsa-runtime/core/runtime/signal.cpp       |  14 +-
 .../runtime/hsa-runtime/core/util/lazy_ptr.h  |   8 +-
 .../runtime/hsa-runtime/core/util/locks.h     | 149 ++----------------
 .../libamdhsacode/amd_hsa_code.cpp            |  48 +++---
 .../runtime/hsa-runtime/loader/executable.cpp | 104 ++++++------
 .../runtime/hsa-runtime/loader/executable.hpp |  12 +-
 .../runtime/hsa-runtime/pcs/pcs_runtime.cpp   |  10 +-
 .../runtime/hsa-runtime/pcs/pcs_runtime.h     |   2 +-
 34 files changed, 319 insertions(+), 452 deletions(-)
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h
index fb8f89c118..ee7ec26a8b 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h
@@ -47,6 +47,7 @@
 
 #include <assert.h>
 #include <vector>
+#include <mutex>
 
 #include "core/inc/checked.h"
 #include "core/inc/isa.h"
@@ -291,7 +292,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
                                void* value) const = 0;
 
   // @brief Returns an array of regions owned by the agent.
-  virtual const std::vector<const core::MemoryRegion*>& regions() const = 0;
+  virtual const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const = 0;
 
   // @brief Returns the ISA's supported by the agent.
   // @details The returned vector is a list of pointers to the supported ISA,
@@ -336,7 +337,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
   __forceinline void Disable() { enabled_ = false; }
 
   virtual void Trim() {
-    for (auto region : regions()) region->Trim();
+    for (const auto& region : regions()) region->Trim();
   }
 
   virtual void ReleaseResources() { }
@@ -385,7 +386,7 @@ protected:
   // Serial memory operations are needed to ensure, among other things, that allocation failures are
   // due to true OOM conditions and per region caching (Trim and Allocate must be serial and
   // exclusive to ensure this).
-  KernelMutex agent_memory_lock_;
+  std::mutex agent_memory_lock_;
 
   // Forbid copying and moving of this object
   DISALLOW_COPY_AND_ASSIGN(Agent);
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h
index a9c77fa18d..c5dd4ccb19 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h
@@ -82,7 +82,7 @@ public:
  /// @brief Override from core::Agent.
  const std::vector<const core::Isa*>& supported_isas() const override { return supported_isas_; }
 
- const std::vector<const core::MemoryRegion*>& regions() const override { return regions_; }
+ const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override { return regions_; }
 
  /// @brief Getter for the AIE system allocator.
  const std::function<void*(size_t size, size_t align, core::MemoryRegion::AllocateFlags flags)>&
@@ -101,7 +101,7 @@ private:
   /// @brief Setup the memory allocators used by this agent.
   void InitAllocators();
 
-  std::vector<const core::MemoryRegion *> regions_;
+  std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
   std::function<void *(size_t size, size_t align,
                        core::MemoryRegion::AllocateFlags flags)>
       system_allocator_;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h
index 1f0bfa1215..ef80f69776 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h
@@ -306,7 +306,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
   // GPU-visible indirect buffer holding PM4 commands.
   void* pm4_ib_buf_;
   uint32_t pm4_ib_size_b_;
-  KernelMutex pm4_ib_mutex_;
+  std::mutex pm4_ib_mutex_;
 
   // Error handler control variable.
   std::atomic<uint32_t> dynamicScratchState, exceptionState;
@@ -322,11 +322,11 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
   Signal* exception_signal_;
 
   // CU mask lock
-  KernelMutex mask_lock_;
+  std::mutex mask_lock_;
 
   // Mutex to prevent AsyncReclaimScratch and HandleInsufficientScratch from
   // happening at the same time.
-  KernelMutex scratch_lock_;
+  std::mutex scratch_lock_;
 
   // Current CU mask
   std::vector<uint32_t> cu_mask_;
@@ -345,10 +345,10 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
   }
 
   // Mutex for queue_event_ manipulation
-KernelMutex& queue_lock() {
+std::mutex& queue_lock() {
   // This allocation is meant to last until the last thread has exited.
   // It is intentionally not freed.
-  static KernelMutex* queue_lock_ = new KernelMutex();
+  static std::mutex* queue_lock_ = new std::mutex();
   return *queue_lock_;
 }
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
index 7d820c8437..dfc9ed1006 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
@@ -255,7 +255,7 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
 
   // Internal signals for blocking APIs
   core::unique_signal_ptr signals_[2];
-  KernelMutex lock_;
+  std::mutex lock_;
   bool parity_;
 
   /// Queue resource descriptor for doorbell, read
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h
index bfa080cf8c..26c72136c1 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h
@@ -127,7 +127,7 @@ class CpuAgent : public core::Agent {
   }
 
   // @brief Override from core::Agent.
-  const std::vector<const core::MemoryRegion*>& regions() const override {
+  const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override {
     return regions_;
   }
 
@@ -151,7 +151,7 @@ class CpuAgent : public core::Agent {
   // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
   // region returns ::HSA_STATUS_SUCCESS.
   hsa_status_t VisitRegion(
-      const std::vector<const core::MemoryRegion*>& regions,
+      const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
       hsa_status_t (*callback)(hsa_region_t region, void* data),
       void* data) const;
 
@@ -166,7 +166,7 @@ class CpuAgent : public core::Agent {
   std::vector<std::unique_ptr<core::Cache>> caches_;
 
   // @brief Array of regions owned by this agent.
-  std::vector<const core::MemoryRegion*> regions_;
+  std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
 
   DISALLOW_COPY_AND_ASSIGN(CpuAgent);
 };
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index d49c4fdd8a..c799cd8611 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -394,7 +394,7 @@ class GpuAgent : public GpuAgentInt {
   }
 
   // @brief Override from core::Agent.
-  const std::vector<const core::MemoryRegion*>& regions() const override {
+  const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override {
     return regions_;
   }
 
@@ -536,7 +536,7 @@ class GpuAgent : public GpuAgentInt {
   // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
   // region returns ::HSA_STATUS_SUCCESS.
   hsa_status_t VisitRegion(
-      const std::vector<const core::MemoryRegion*>& regions,
+      const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
       hsa_status_t (*callback)(hsa_region_t region, void* data),
       void* data) const;
 
@@ -594,7 +594,7 @@ class GpuAgent : public GpuAgentInt {
   std::vector<const core::Agent*> xgmi_peer_list_;
 
   // Protects xgmi_peer_list_
-  KernelMutex xgmi_peer_list_lock_;
+  std::mutex xgmi_peer_list_lock_;
 
   // @brief AQL queues for cache management and blit compute usage.
   enum QueueEnum {
@@ -607,19 +607,19 @@ class GpuAgent : public GpuAgentInt {
   lazy_ptr<core::Queue> queues_[QueueCount];
 
   // @brief Mutex to protect the update to coherency type.
-  KernelMutex coherency_lock_;
+  std::mutex coherency_lock_;
 
   // @brief Mutex to protect access to scratch pool.
-  KernelMutex scratch_lock_;
+  std::mutex scratch_lock_;
 
   // @brief Mutex to protect access to ::t1_.
-  KernelMutex t1_lock_;
+  std::mutex t1_lock_;
 
   // @brief Mutex to protect access to blit objects.
-  KernelMutex blit_lock_;
+  std::mutex blit_lock_;
 
   // @brief Mutex to protect sdma gang submissions.
-  KernelMutex sdma_gang_lock_;
+  std::mutex sdma_gang_lock_;
 
   // @brief GPU tick on initialization.
   HsaClockCounters t0_;
@@ -638,7 +638,7 @@ class GpuAgent : public GpuAgentInt {
   std::vector<std::unique_ptr<core::Cache>> caches_;
 
   // @brief Array of regions owned by this agent.
-  std::vector<const core::MemoryRegion*> regions_;
+  std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
 
   core::Isa* isa_;
 
@@ -729,7 +729,7 @@ class GpuAgent : public GpuAgentInt {
   struct {
     lazy_ptr<core::Queue> queue_;
     int ref_ct_;
-    KernelMutex lock_;
+    std::mutex lock_;
   } gws_queue_;
 
   // @brief list of AQL queues owned by this agent. Indexed by queue pointer
@@ -763,7 +763,7 @@ class GpuAgent : public GpuAgentInt {
   /// @brief Coarse-grain deallocator on this GPU.
   std::function<void(void*)> coarsegrain_deallocator_;
 
-  void* trap_handler_tma_region_;
+  std::unique_ptr<void, std::function<void(void*)>> trap_handler_tma_region_;
 
   /* PC Sampling fields - begin */
   /* 2nd level Trap handler code is based on the offsets within this structure */
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp
index d44f4d095b..7027ee3364 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp
@@ -181,7 +181,7 @@ namespace code {
       std::vector<Segment*> dataSegments;
       std::vector<Section*> dataSections;
       std::vector<RelocationSection*> relocationSections;
-      std::vector<Symbol*> symbols;
+      std::vector<std::shared_ptr<Symbol>> symbols;
       bool combineDataSegments;
       Segment* hsaSegments[AMDGPU_HSA_SEGMENT_LAST][2];
       Section* hsaSections[AMDGPU_HSA_SECTION_LAST];
@@ -234,7 +234,7 @@ namespace code {
       uint32_t OsAbi() const { return img->OsAbi(); }
 
       AmdHsaCode(bool combineDataSegments = true);
-      virtual ~AmdHsaCode();
+      virtual ~AmdHsaCode() = default;
 
       std::string output() { return out.str(); }
       bool LoadFromFile(const std::string& filename);
@@ -347,7 +347,7 @@ namespace code {
       RelocationSection* GetRelocationSection(size_t i) { return relocationSections[i]; }
 
       size_t SymbolCount() { return symbols.size(); }
-      Symbol* GetSymbol(size_t i) { return symbols[i]; }
+      Symbol* GetSymbol(size_t i) { return symbols[i].get(); }
       Symbol* GetSymbolByElfIndex(size_t index);
       Symbol* FindSymbol(const std::string &n);
 
@@ -362,11 +362,11 @@ namespace code {
 
     class AmdHsaCodeManager {
     private:
-      typedef std::unordered_map<uint64_t, AmdHsaCode*> CodeMap;
+      typedef std::unordered_map<uint64_t, std::shared_ptr<AmdHsaCode>> CodeMap;
       CodeMap codeMap;
 
     public:
-      AmdHsaCode* FromHandle(hsa_code_object_t handle);
+      const std::shared_ptr<AmdHsaCode>& FromHandle(hsa_code_object_t handle);
       bool Destroy(hsa_code_object_t handle);
     };
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp
index 5625e2e1a6..3460b83f24 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp
@@ -422,7 +422,7 @@ private:
   Executable(const Executable &e);
   Executable& operator=(const Executable &e);
 
-  static std::vector<Executable*> executables;
+  static std::vector<std::shared_ptr<Executable>> executables;
   static std::mutex executables_mutex;
 };
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h
index 82b110d70d..d52ba26ddc 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h
@@ -187,7 +187,7 @@ private:
 
   // Protects against concurrent allow_access calls to fragments of the same block by virtue of all
   // fragments of the block routing to the same MemoryRegion.
-  mutable KernelMutex access_lock_;
+  mutable std::mutex access_lock_;
 
   static const size_t kPageSize_;
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h
index 95d7259b30..f7e1d18fec 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h
@@ -216,7 +216,7 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi
 
  private:
   // Serialize packet interception processing.
-  KernelMutex lock_;
+  std::mutex lock_;
 
   // Largest processed packet index.
   uint64_t next_packet_;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/ipc_signal.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/ipc_signal.h
index 0d2e0ae445..8d565093b3 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/ipc_signal.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/ipc_signal.h
@@ -103,7 +103,7 @@ class IPCSignal : private SharedMemorySignal, public BusyWaitSignal {
     static int rtti_id_ = 0;
       return rtti_id_;
   }
-  static KernelMutex lock_;
+  static std::mutex lock_;
 
   explicit IPCSignal(SharedMemorySignal&& abi_block)
       : SharedMemorySignal(std::move(abi_block)), BusyWaitSignal(signal(), true) {}
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h
index 800bc94ca5..4aa92ae95d 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h
@@ -51,6 +51,7 @@
 #include <tuple>
 #include <utility>
 #include <thread>
+#include <shared_mutex>
 #if defined(__linux__)
 #include <sys/un.h>
 #include <xf86drm.h>
@@ -437,15 +438,15 @@ class Runtime {
 
   Agent* region_gpu() { return region_gpu_; }
 
-  const std::vector<const MemoryRegion*>& system_regions_fine() const {
+  const std::vector<std::shared_ptr<const MemoryRegion>>& system_regions_fine() const {
     return system_regions_fine_;
   }
 
-  const std::vector<const MemoryRegion*>& system_regions_coarse() const {
+  const std::vector<std::shared_ptr<const MemoryRegion>>& system_regions_coarse() const {
     return system_regions_coarse_;
   }
 
-  amd::hsa::loader::Loader* loader() { return loader_; }
+  amd::hsa::loader::Loader* loader() { return loader_.get(); }
 
   amd::LoaderContext* loader_context() { return &loader_context_; }
 
@@ -719,10 +720,10 @@ class Runtime {
 
   // Will be created before any user could call hsa_init but also could be
   // destroyed before incorrectly written programs call hsa_shutdown.
-  static __forceinline KernelMutex& bootstrap_lock() {
+  static __forceinline std::mutex& bootstrap_lock() {
     // This allocation is meant to last until the last thread has exited.
     // It is intentionally not freed.
-    static KernelMutex* bootstrap_lock_ = new KernelMutex;
+    static std::mutex* bootstrap_lock_ = new std::mutex;
     return *bootstrap_lock_;
   }
   Runtime();
@@ -780,7 +781,7 @@ class Runtime {
   // Also ensures atomicity of pointer info queries by interlocking
   // KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo
   // registered & mapped arrays.
-  KernelSharedMutex memory_lock_;
+  std::shared_mutex memory_lock_;
 
   // Array containing driver interfaces for compatible agent kernel-mode
   // drivers. Currently supports AIE agents.
@@ -811,16 +812,16 @@ class Runtime {
   std::vector<uint32_t> gpu_ids_;
 
   // List of all fine grain system memory region in the platform.
-  std::vector<const MemoryRegion*> system_regions_fine_;
+  std::vector<std::shared_ptr<const MemoryRegion>> system_regions_fine_;
 
   // List of all coarse grain system memory region in the platform.
-  std::vector<const MemoryRegion*> system_regions_coarse_;
+  std::vector<std::shared_ptr<const MemoryRegion>> system_regions_coarse_;
 
   // Matrix of IO link.
   std::vector<LinkInfo> link_matrix_;
 
   // Loader instance.
-  amd::hsa::loader::Loader* loader_;
+  std::unique_ptr<amd::hsa::loader::Loader> loader_;
 
   // Loader context.
   amd::LoaderContext loader_context_;
@@ -832,7 +833,7 @@ class Runtime {
   std::map<const void*, AllocationRegion> allocation_map_;
 
   // Pending prefetch containers.
-  KernelMutex prefetch_lock_;
+  std::mutex prefetch_lock_;
   prefetch_map_t prefetch_map_;
 
   // Allocator using ::system_region_
@@ -853,24 +854,29 @@ class Runtime {
   // Number of Numa Nodes
   size_t num_nodes_;
 
+  struct HsaEventDeleter {
+    void operator()(HsaEvent* event) const { InterruptSignal::DestroyEvent(event); }
+  };
+  using unique_hsa_event_ptr = std::unique_ptr<HsaEvent, HsaEventDeleter>;
+
   // @brief AMD HSA event to monitor for virtual memory access fault.
-  HsaEvent* vm_fault_event_;
+  unique_hsa_event_ptr vm_fault_event_;
 
   // @brief HSA signal to contain the VM fault event.
-  Signal* vm_fault_signal_;
+  unique_signal_ptr vm_fault_signal_;
 
   // @brief AMD HSA event to monitor for HW exceptions.
-  HsaEvent* hw_exception_event_;
+  unique_hsa_event_ptr hw_exception_event_;
 
   // @brief HSA signal to contain the HW exceptionevent.
-  Signal* hw_exception_signal_;
+  unique_signal_ptr hw_exception_signal_;
 
   // Custom system event handlers.
   std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
       system_event_handlers_;
 
   // System event handler lock
-  KernelMutex system_event_lock_;
+  std::mutex system_event_lock_;
 
   // Internal queue creation notifier
   AMD::callback_t<hsa_amd_runtime_queue_notifier> internal_queue_create_notifier_;
@@ -898,8 +904,8 @@ class Runtime {
 
   // IPC DMA buf unix domain socket server dmabuf FD passing
   int ipc_sock_server_fd_;
-  std::map<uint64_t, size_t> ipc_sock_server_conns_;
-  KernelMutex ipc_sock_server_lock_;
+  std::map<uint64_t, int> ipc_sock_server_conns_;
+  std::mutex ipc_sock_server_lock_;
 
  private:
   void CheckVirtualMemApiSupport();
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h
index 4647604223..1dd9260ae6 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h
@@ -50,6 +50,7 @@
 #include <memory>
 #include <vector>
 #include <utility>
+#include <mutex>
 
 #include "hsakmt/hsakmt.h"
 
@@ -499,7 +500,7 @@ class Signal {
   core::Agent* async_copy_agent_;
 
  private:
-  static KernelMutex ipcLock_;
+  static std::mutex ipcLock_;
   static std::map<decltype(hsa_signal_t::handle), Signal*> ipcMap_;
 
   static Signal* lookupIpc(hsa_signal_t signal);
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp
index c66273ebb8..af7b7a1bf5 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp
@@ -66,7 +66,6 @@ AieAgent::AieAgent(uint32_t node, const HsaNodeProperties& node_props)
 }
 
 AieAgent::~AieAgent() {
-  std::for_each(regions_.begin(), regions_.end(), DeleteObject());
   regions_.clear();
 }
 
@@ -75,8 +74,8 @@ hsa_status_t AieAgent::VisitRegion(bool include_peer,
                                                             void *data),
                                    void *data) const {
   AMD::callback_t<decltype(callback)> call(callback);
-  for (const auto r : regions_) {
-    hsa_region_t region_handle(core::MemoryRegion::Convert(r));
+  for (const auto& r : regions_) {
+    hsa_region_t region_handle(core::MemoryRegion::Convert(r.get()));
     hsa_status_t err = call(region_handle, data);
     if (err != HSA_STATUS_SUCCESS) {
       return err;
@@ -321,24 +320,25 @@ void AieAgent::InitRegionList() {
   /// explicit sync operations.
   regions_.reserve(3);
   regions_.push_back(
-      new MemoryRegion(false, true, false, false, true, this, sys_mem_props));
+    std::make_shared<MemoryRegion>(false, true, false, false, true, this, sys_mem_props));
   regions_.push_back(
-      new MemoryRegion(false, false, false, false, true, this, dev_mem_props));
-  regions_.push_back(new MemoryRegion(false, false, false, false, true, this,
-                                      other_mem_props));
+    std::make_shared<MemoryRegion>(false, false, false, false, true, this, dev_mem_props));
+  regions_.push_back(
+    std::make_shared<MemoryRegion>(false, false, false, false, true, this, other_mem_props));
 }
 
 void AieAgent::InitAllocators() {
-  for (const auto *region : regions()) {
+  for (const auto& region : regions()) {
     const MemoryRegion *amd_mem_region(
-        static_cast<const MemoryRegion *>(region));
+        static_cast<const MemoryRegion *>(region.get()));
     if (amd_mem_region->kernarg()) {
+      const core::MemoryRegion* region_ptr = region.get();
       system_allocator_ =
-          [region](size_t size, size_t align,
+          [region_ptr](size_t size, size_t align,
                    core::MemoryRegion::AllocateFlags alloc_flags) -> void * {
         void *mem(nullptr);
         return (core::Runtime::runtime_singleton_->AllocateMemory(
-                    region, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS)
+                    region_ptr, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS)
                    ? mem
                    : nullptr;
       };
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index 396edeff7e..ff05e8ecc1 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -165,8 +165,8 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
   // Set group and private memory apertures in amd_queue_.
   auto& regions = agent->regions();
 
-  for (auto region : regions) {
-    const MemoryRegion* amdregion = static_cast<const AMD::MemoryRegion*>(region);
+  for (const auto& region : regions) {
+    const MemoryRegion* amdregion = static_cast<const AMD::MemoryRegion*>(region.get());
     uint64_t base = amdregion->GetBaseAddress();
 
     if (amdregion->IsLDS()) {
@@ -217,7 +217,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
   }
 
   MAKE_NAMED_SCOPE_GUARD(EventGuard, [&]() {
-    ScopedAcquire<KernelMutex> _lock(&queue_lock());
+    std::lock_guard<std::mutex> _lock(queue_lock());
     queue_count()--;
     if (queue_count() == 0) {
       core::InterruptSignal::DestroyEvent(queue_event());
@@ -232,7 +232,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
   });
 
   if (core::g_use_interrupt_wait) {
-    ScopedAcquire<KernelMutex> _lock(&queue_lock());
+    std::lock_guard<std::mutex> _lock(queue_lock());
     queue_count()++;
     if (queue_event() == nullptr) {
       assert(queue_count() == 1 && "Inconsistency in queue event reference counting found.\n");
@@ -387,7 +387,7 @@ AqlQueue::~AqlQueue() {
   FreeQueueMemory();
 
   if (core::g_use_interrupt_wait) {
-    ScopedAcquire<KernelMutex> lock(&queue_lock());
+    std::lock_guard<std::mutex> lock(queue_lock());
     queue_count()--;
     if (queue_count() == 0) {
       core::InterruptSignal::DestroyEvent(queue_event());
@@ -777,7 +777,7 @@ void AqlQueue::AsyncReclaimMainScratch() {
   tool::notify_event_scratch_async_reclaim_start(public_handle(),
                                                  HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE);
 
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
 
   // Unmap the queue. CP will check amd_queue_ fields on re-map
   Suspend();
@@ -849,7 +849,7 @@ void AqlQueue::AsyncReclaimAltScratch() {
   tool::notify_event_scratch_async_reclaim_start(public_handle(),
                                                  HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT);
 
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
 
   // Unmap the queue. CP will check amd_queue_ fields on re-map
   Suspend();
@@ -1014,7 +1014,7 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
   const uint64_t device_size = size_per_thread * lanes_per_wave * device_slots;
   const uint64_t dispatch_size = size_per_thread * lanes_per_wave * dispatch_slots;
 
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
 
   // scratch.use_alt_limit will be 0 if alt scratch is not supported or disabled
   if (dispatch_size < scratch.use_alt_limit && dispatch_slots < device_slots) {
@@ -1393,7 +1393,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t*
   if ((mask.size() == mask_dwords) && (tail_mask != 0)) mask[mask_dwords - 1] &= tail_mask;
 
   // Apply mask if non-default or not queue initialization.
-  ScopedAcquire<KernelMutex> lock(&mask_lock_);
+  std::lock_guard<std::mutex> lock(mask_lock_);
   if ((!cu_mask_.empty()) || (num_cu_mask_count != 0) || (!global_mask.empty())) {
 
     // Devices with WGPs must conform to even-indexed contiguous pairwise CU enablement.
@@ -1414,7 +1414,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t*
 }
 
 hsa_status_t AqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) {
-  ScopedAcquire<KernelMutex> lock(&mask_lock_);
+  std::lock_guard<std::mutex> lock(mask_lock_);
   assert(!cu_mask_.empty() && "No current cu_mask!");
 
   uint32_t user_dword_count = num_cu_mask_count / 32;
@@ -1440,7 +1440,7 @@ void AqlQueue::SetProfiling(bool enabled) {
 void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope_t acquireFence,
                           hsa_fence_scope_t releaseFence, hsa_signal_t* in_signal) {
   // pm4_ib_buf_ is a shared resource, so mutually exclude here.
-  ScopedAcquire<KernelMutex> lock(&pm4_ib_mutex_);
+  std::lock_guard<std::mutex> lock(pm4_ib_mutex_);
 
   // Obtain reference to any container queue.
   core::Queue* queue = core::Queue::Convert(public_handle());
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
index 3067f827d3..d1681f968a 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
@@ -293,7 +293,7 @@ static bool DepSignalCompleteHandler(hsa_signal_value_t signal_value, void *arg
 template <bool useGCR>
 hsa_status_t BlitSdma<useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd_size,
                                                      uint64_t size) {
-  ScopedAcquire<KernelMutex> lock(&lock_);
+  std::unique_lock<std::mutex> lock(lock_);
 
   // Alternate between completion signals
   // Using two allows overlapping command writing and copies
@@ -310,7 +310,7 @@ hsa_status_t BlitSdma<useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd
   // Mark signal as in use, guard against exception leaving the signal in an unusable state.
   completionSignal->StoreRelaxed(2);
   MAKE_SCOPE_GUARD([&]() { completionSignal->StoreRelaxed(0); });
-  lock.Release();
+  lock.unlock();
 
   std::vector<core::Signal*> gang_signals(0);
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp
index 37eda03d85..5ad9730344 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp
@@ -64,7 +64,6 @@ CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props,
 }
 
 CpuAgent::~CpuAgent() {
-  std::for_each(regions_.begin(), regions_.end(), DeleteObject());
   regions_.clear();
 }
 
@@ -87,17 +86,17 @@ void CpuAgent::InitRegionList() {
     if (system_prop != mem_props.end()) system_props = *system_prop;
 
     // Fine-Grain Memory
-    regions_.push_back(new MemoryRegion(true, false, is_apu_node, false, true, this, system_props));
+    regions_.push_back(std::make_shared<MemoryRegion>(true, false, is_apu_node, false, true, this, system_props));
 
     // Ext-Fine-Grain Memory
-    regions_.push_back(new MemoryRegion(false, false, is_apu_node, true, true, this, system_props));
+    regions_.push_back(std::make_shared<MemoryRegion>(false, false, is_apu_node, true, true, this, system_props));
 
     // Kernargs
-    regions_.push_back(new MemoryRegion(true, true, is_apu_node, false, true, this, system_props));
+    regions_.push_back(std::make_shared<MemoryRegion>(true, true, is_apu_node, false, true, this, system_props));
 
     if (!is_apu_node) {
       // Coarse Grain
-      regions_.push_back(new MemoryRegion(false, false, is_apu_node, false, true, this, system_props));
+      regions_.push_back(std::make_shared<MemoryRegion>(false, false, is_apu_node, false, true, this, system_props));
     }
   }
 }
@@ -150,12 +149,12 @@ hsa_status_t CpuAgent::VisitRegion(bool include_peer,
 }
 
 hsa_status_t CpuAgent::VisitRegion(
-    const std::vector<const core::MemoryRegion*>& regions,
+    const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
     hsa_status_t (*callback)(hsa_region_t region, void* data),
     void* data) const {
-  for (const core::MemoryRegion* region : regions) {
+  for (const auto& region : regions) {
     if (!region->user_visible()) continue;
-    hsa_region_t region_handle = core::MemoryRegion::Convert(region);
+    hsa_region_t region_handle = core::MemoryRegion::Convert(region.get());
     hsa_status_t status = callback(region_handle, data);
     if (status != HSA_STATUS_SUCCESS) {
       return status;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 2537bb8256..cf4e90a5ab 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -112,7 +112,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
       scratch_limit_async_threshold_(0),
       scratch_cache_(
           [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }),
-      trap_handler_tma_region_(NULL),
+      trap_handler_tma_region_(nullptr, [this](void* ptr) {
+        if (ptr && this->finegrain_deallocator()) this->finegrain_deallocator()(ptr);  // test the callable that is invoked
+      }),
       rec_sdma_eng_override_(false),
       pcs_hosttrap_data_(),
       pcs_stochastic_data_(),
@@ -246,7 +248,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
 GpuAgent::~GpuAgent() {
   for (auto& blit : blits_) blit.reset();
 
-  std::for_each(regions_.begin(), regions_.end(), DeleteObject());
   regions_.clear();
 }
 
@@ -454,22 +455,20 @@ void GpuAgent::InitRegionList() {
           memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
         case HSA_HEAPTYPE_GPU_LDS:
         case HSA_HEAPTYPE_GPU_SCRATCH: {
-          MemoryRegion* region =
-              new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]);
-
+          std::shared_ptr<MemoryRegion> region = std::make_shared<MemoryRegion>(false, false, false, false, true, this, mem_props[mem_idx]);
           regions_.push_back(region);
 
           if (region->IsLocalMemory()) {
             // Extended Fine-Grain memory
             if (!(isa_->GetMajorVersion() == 12 && isa_->GetMinorVersion() == 0))
               regions_.push_back(
-                  new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx]));
+                  std::make_shared<MemoryRegion>(false, false, false, true, true, this, mem_props[mem_idx]));
 
             // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
             bool user_visible = (properties_.HiveID != 0) ||
                 core::Runtime::runtime_singleton_->flag().fine_grain_pcie();
 
-            regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this,
+            regions_.push_back(std::make_shared<MemoryRegion>(true, false, false, false, user_visible, this,
                                                 mem_props[mem_idx]));
           }
           break;
@@ -561,7 +560,7 @@ void GpuAgent::ReserveScratch()
   size_t available;
   hsa_status_t err = driver().AvailableMemory(node_id(), &available);
   assert(err == HSA_STATUS_SUCCESS && "AvailableMemory failed");
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
   if (!scratch_cache_.reserved_bytes() && reserved_sz && available > 8 * reserved_sz) {
     HSAuint64 alt_va;
     void* reserved_base = scratch_pool_.alloc(reserved_sz);
@@ -676,20 +675,20 @@ hsa_status_t GpuAgent::VisitRegion(bool include_peer,
 }
 
 hsa_status_t GpuAgent::VisitRegion(
-    const std::vector<const core::MemoryRegion*>& regions,
+    const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
     hsa_status_t (*callback)(hsa_region_t region, void* data),
     void* data) const {
   AMD::callback_t<decltype(callback)> call(callback);
-  for (const core::MemoryRegion* region : regions) {
+  for (const auto& region : regions) {
     if (!region->user_visible()) continue;
 
     const AMD::MemoryRegion* amd_region =
-        reinterpret_cast<const AMD::MemoryRegion*>(region);
+        reinterpret_cast<const AMD::MemoryRegion*>(region.get());
 
     // Only expose system, local, and LDS memory.
     if (amd_region->IsSystem() || amd_region->IsLocalMemory() ||
         amd_region->IsLDS()) {
-      hsa_region_t region_handle = core::MemoryRegion::Convert(region);
+      hsa_region_t region_handle = core::MemoryRegion::Convert(region.get());
       hsa_status_t status = call(region_handle, data);
       if (status != HSA_STATUS_SUCCESS) {
         return status;
@@ -910,7 +909,7 @@ void GpuAgent::InitGWS() {
 }
 
 void GpuAgent::GWSRelease() {
-  ScopedAcquire<KernelMutex> lock(&gws_queue_.lock_);
+  std::lock_guard<std::mutex> lock(gws_queue_.lock_);
   gws_queue_.ref_ct_--;
   if (gws_queue_.ref_ct_ != 0) return;
   InitGWS();
@@ -968,22 +967,22 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) {
 }
 
 void GpuAgent::SetCopyRequestRefCount(bool set) {
-  ScopedAcquire<KernelMutex> lock(&blit_lock_);
+  std::unique_lock<std::mutex> lock(blit_lock_);
   while (pending_copy_stat_check_ref_) {
-    blit_lock_.Release();
+    lock.unlock();
     os::YieldThread();
-    blit_lock_.Acquire();
+    lock.lock();
   }
   if (!set && pending_copy_req_ref_) pending_copy_req_ref_--;
   else pending_copy_req_ref_++;
 }
 
 void GpuAgent::SetCopyStatusCheckRefCount(bool set) {
-  ScopedAcquire<KernelMutex> lock(&blit_lock_);
+  std::unique_lock<std::mutex> lock(blit_lock_);
   while (pending_copy_req_ref_) {
-    blit_lock_.Release();
+    lock.unlock();
     os::YieldThread();
-    blit_lock_.Acquire();
+    lock.lock();
   }
   if (!set && pending_copy_stat_check_ref_) pending_copy_stat_check_ref_--;
   else pending_copy_stat_check_ref_++;
@@ -1059,7 +1058,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
                       std::min(gang_factor, properties_.NumSdmaXgmiEngines);
   }
 
-  ScopedAcquire<KernelMutex> lock(&sdma_gang_lock_);
+  std::lock_guard<std::mutex> lock(sdma_gang_lock_);
   // Manage internal gang signals
   std::vector<core::Signal*> gang_signals;
   if (gang_factor > 1) {
@@ -1642,7 +1641,7 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
 
       if (status != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
 
-      for (auto r : regions()) availableBytes += ((AMD::MemoryRegion*)r)->GetCacheSize();
+      for (const auto& r : regions()) availableBytes += ((AMD::MemoryRegion*)(r.get()))->GetCacheSize();
 
       availableBytes += scratch_cache_.free_bytes() - scratch_cache_.reserved_bytes();
 
@@ -1730,7 +1729,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
                                    core::Queue** queue) {
   // Handle GWS queues.
   if (queue_type == HSA_QUEUE_TYPE_COOPERATIVE) {
-    ScopedAcquire<KernelMutex> lock(&gws_queue_.lock_);
+    std::lock_guard<std::mutex> lock(gws_queue_.lock_);
     auto ret = (*gws_queue_.queue_).get();
     if (ret != nullptr) {
       gws_queue_.ref_ct_++;
@@ -1876,7 +1875,7 @@ void GpuAgent::AcquireQueueMainScratch(ScratchInfo& scratch) {
   */
   bool large;
 
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
   const size_t small_limit = scratch_pool_.size() >> 3;
   bool use_reclaim = true;
 
@@ -2035,7 +2034,7 @@ void GpuAgent::AcquireQueueAltScratch(ScratchInfo& scratch) {
   uint64_t size_per_wave = AlignUp(scratch.alt_size_per_thread * properties_.WaveFrontSize, 1024);
   if (size_per_wave > MAX_WAVE_SCRATCH) return;
 
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
 
   // Ensure mapping will be in whole pages.
   scratch.alt_size = AlignUp(scratch.alt_size, 4096);
@@ -2176,7 +2175,7 @@ uint64_t GpuAgent::TranslateTime(uint64_t tick) {
   // Limit errors due to relative frequency drift to ~0.5us.  Sync clocks at 16Hz.
   const int64_t max_extrapolation = core::Runtime::runtime_singleton_->sys_clock_freq() >> 4;
 
-  ScopedAcquire<KernelMutex> lock(&t1_lock_);
+  std::lock_guard<std::mutex> lock(t1_lock_);
   // Limit errors due to correlated pair certainty to ~0.5us.
   // extrapolated time < (0.5us / half clock read certainty) * delay between clock measures
   // clock read certainty is <4us.
@@ -2261,26 +2260,27 @@ hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttra
     ((uint64_t*)tma_region_host)[1] = (uint64_t)pcs_stochastic_buffers;
 
     if (!trap_handler_tma_region_) {
-      trap_handler_tma_region_ = (uint64_t*)finegrain_allocator()(2 * sizeof(uint64_t), 0);
-      if (trap_handler_tma_region_ == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+      void* mem = finegrain_allocator()(2 * sizeof(uint64_t), 0);
+      if (!mem) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+      trap_handler_tma_region_.reset(mem);
 
       // NearestCpuAgent owns pool returned system_allocator()
       auto cpuAgent = GetNearestCpuAgent()->public_handle();
 
       hsa_status_t ret =
-          AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_);
+          AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_.get());
       assert(ret == HSA_STATUS_SUCCESS);
     }
 
     /* On non-large BAR systems, we may not be able to access device memory, so do a DmaCopy */
-    if (DmaCopy(trap_handler_tma_region_, tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS)
+    if (DmaCopy(trap_handler_tma_region_.get(), tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS)
       return HSA_STATUS_ERROR;
 
     tma_size = 2 * sizeof(uint64_t);
-    tma_addr = trap_handler_tma_region_;
+    tma_addr = trap_handler_tma_region_.get();
   } else if (trap_handler_tma_region_) {
-    finegrain_deallocator()(trap_handler_tma_region_);
-    trap_handler_tma_region_ = NULL;
+    trap_handler_tma_region_.reset(nullptr);
   }
 
   // Bind the trap handler to this node.
@@ -2398,7 +2398,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
   uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines;
   assert((xgmi_engine_cnt > 0) && ("Illegal condition, should not happen"));
 
-  ScopedAcquire<KernelMutex> lock(&xgmi_peer_list_lock_);
+  std::lock_guard<std::mutex> lock(xgmi_peer_list_lock_);
 
   for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
     uint64_t dst_handle = dst_agent.public_handle().handle;
@@ -2490,19 +2490,20 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
 void GpuAgent::Trim() {
   Agent::Trim();
   AsyncReclaimScratchQueues();
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
   scratch_cache_.trim(false);
 }
 
 void GpuAgent::InitAllocators() {
-  for (auto pool : GetNearestCpuAgent()->regions()) {
+  for (const auto& pool : GetNearestCpuAgent()->regions()) {
     if (pool->kernarg()) {
-      system_allocator_ = [pool](size_t size, size_t alignment,
+      const core::MemoryRegion* pool_ptr = pool.get();
+      system_allocator_ = [pool_ptr](size_t size, size_t alignment,
                                  MemoryRegion::AllocateFlags alloc_flags) -> void* {
         assert(alignment <= 4096);
         void* ptr = nullptr;
         return (HSA_STATUS_SUCCESS ==
-                core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, &ptr))
+                core::Runtime::runtime_singleton_->AllocateMemory(pool_ptr, size, alloc_flags, &ptr))
             ? ptr
             : nullptr;
       };
@@ -2513,14 +2514,14 @@ void GpuAgent::InitAllocators() {
   assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool.");
 
   // Setup this GPU's fine-grain and coarse-grain allocators.
-  for (auto region : regions()) {
-    const AMD::MemoryRegion* amd_region = static_cast<const AMD::MemoryRegion*>(region);
+  for (const auto& region : regions()) {
+    const AMD::MemoryRegion* amd_region = static_cast<const AMD::MemoryRegion*>(region.get());
 
-    auto region_allocator = [region](size_t size,
+    auto region_allocator = [amd_region](size_t size,
                                      MemoryRegion::AllocateFlags alloc_flags) -> void* {
       void* ptr = nullptr;
        return (HSA_STATUS_SUCCESS ==
-               core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr))
+               core::Runtime::runtime_singleton_->AllocateMemory(amd_region, size, alloc_flags, &ptr))
            ? ptr
            : nullptr;
     };
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp
index 51ad5dc04a..a76005656f 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp
@@ -283,18 +283,18 @@ const core::MemoryRegion* RegionMemory::AgentLocal(hsa_agent_t agent, bool is_co
   assert(amd_agent->device_type() == core::Agent::kAmdGpuDevice && "Invalid agent type.");
   auto agent_local_region =
       std::find_if(amd_agent->regions().begin(), amd_agent->regions().end(),
-                   [&](const core::MemoryRegion* region) {
-                     const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region;
+                   [&](const std::shared_ptr<const core::MemoryRegion>& region) {
+                     const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region.get();
                      return amd_region->IsLocalMemory() && (!amd_region->fine_grain());
                    });
-  return agent_local_region == amd_agent->regions().end() ? nullptr : *agent_local_region;
+  return agent_local_region == amd_agent->regions().end() ? nullptr : agent_local_region->get();
 }
 
 const core::MemoryRegion* RegionMemory::System(bool is_code) {
   if (is_code)
-    return core::Runtime::runtime_singleton_->system_regions_coarse()[0];
+    return core::Runtime::runtime_singleton_->system_regions_coarse()[0].get();
   else
-    return core::Runtime::runtime_singleton_->system_regions_fine()[0];
+    return core::Runtime::runtime_singleton_->system_regions_fine()[0].get();
 }
 
 bool RegionMemory::Allocate(size_t size, size_t align, bool zero) {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
index 842ef96165..dce3912d58 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
@@ -48,6 +48,8 @@
 #include "core/inc/amd_memory_region.h"
 
 #include <algorithm>
+#include <mutex>
+#include <shared_mutex>
 
 #include "core/inc/runtime.h"
 #include "core/inc/amd_cpu_agent.h"
@@ -132,7 +134,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
 MemoryRegion::~MemoryRegion() {}
 
 hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const {
-  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
+  std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
   return AllocateImpl(size, alloc_flags, address, agent_node_id);
 }
 
@@ -160,7 +162,7 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
 }
 
 hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
-  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
+  std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
   return FreeImpl(address, size);
 }
 
@@ -172,7 +174,7 @@ hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const {
 
 // TODO:  Look into a better name and/or making this process transparent to exporting.
 hsa_status_t MemoryRegion::IPCFragmentExport(void* address) const {
-  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
+  std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
   if (!fragment_allocator_.discardBlock(address)) return HSA_STATUS_ERROR_INVALID_ALLOCATION;
   return HSA_STATUS_SUCCESS;
 }
@@ -448,7 +450,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
   std::vector<uint64_t> union_agents;
   info.size = sizeof(info);
 
-  ScopedAcquire<KernelMutex> lock(&access_lock_);
+  std::lock_guard<std::mutex> lock(access_lock_);
 
   if (core::Runtime::runtime_singleton_->PtrInfo(const_cast<void*>(ptr), &info, malloc,
                                                  &agent_count, &accessible,
@@ -512,8 +514,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
 
   {  // Sequence with pointer info since queries to other fragments of the block may be adjusted by
      // this call.
-    ScopedAcquire<KernelSharedMutex::Shared> lock(
-        core::Runtime::runtime_singleton_->memory_lock_.shared());
+    std::shared_lock<std::shared_mutex> lock(core::Runtime::runtime_singleton_->memory_lock_);
     uint64_t alternate_va = 0;
     if (owner()->driver().MakeMemoryResident(ptr, size, &alternate_va, &map_flag,
                                              whitelist_nodes.size(),
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp
index 147ce1ba6d..291a4382a9 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp
@@ -1804,7 +1804,7 @@ hsa_status_t hsa_code_object_serialize(
   IS_BAD_PTR(serialized_code_object);
   IS_BAD_PTR(serialized_code_object_size);
 
-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
   if (!code) {
     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
   }
@@ -1982,7 +1982,7 @@ hsa_status_t hsa_code_object_get_info(
   IS_OPEN();
   IS_BAD_PTR(value);
 
-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
   if (!code) {
     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
   }
@@ -2039,7 +2039,7 @@ hsa_status_t hsa_code_object_get_symbol(
   IS_BAD_PTR(symbol_name);
   IS_BAD_PTR(symbol);
 
-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
   if (!code) {
     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
   }
@@ -2059,7 +2059,7 @@ hsa_status_t hsa_code_object_get_symbol_from_name(
   IS_BAD_PTR(symbol_name);
   IS_BAD_PTR(symbol);
 
-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
   if (!code) {
     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
   }
@@ -2097,7 +2097,7 @@ hsa_status_t hsa_code_object_iterate_symbols(
   IS_OPEN();
   IS_BAD_PTR(callback);
 
-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
   if (!code) {
     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
   }
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
index 9e71e71f94..b81c919eeb 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
@@ -759,7 +759,7 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
   }
 
   const AMD::MemoryRegion* system_region = static_cast<const AMD::MemoryRegion*>(
-      core::Runtime::runtime_singleton_->system_regions_coarse()[0]);
+      core::Runtime::runtime_singleton_->system_regions_coarse()[0].get());
 
   return system_region->Lock(num_agent, agents, host_ptr, size, 0, agent_ptr);
   CATCH;
@@ -799,7 +799,7 @@ hsa_status_t hsa_amd_memory_unlock(void* host_ptr) {
 
   const AMD::MemoryRegion* system_region =
       reinterpret_cast<const AMD::MemoryRegion*>(
-          core::Runtime::runtime_singleton_->system_regions_fine()[0]);
+          core::Runtime::runtime_singleton_->system_regions_fine()[0].get());
 
   return system_region->Unlock(host_ptr);
   CATCH;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp
index ebae2fce0e..d1eef19208 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp
@@ -340,7 +340,7 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
     return;
   }
 
-  ScopedAcquire<KernelMutex> lock(&lock_);
+  std::lock_guard<std::mutex> lock(lock_);
 
   // Submit overflow packets.
   if (!overflow_.empty()) {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp
index bf0d7179d7..c1b4b21b67 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp
@@ -48,7 +48,7 @@ namespace rocr {
 namespace core {
 
 HsaEvent* InterruptSignal::EventPool::alloc() {
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
   if (events_.empty()) {
     if (!allEventsAllocated) {
       HsaEvent* evt = InterruptSignal::CreateEvent(HSA_EVENTTYPE_SIGNAL, false);
@@ -64,7 +64,7 @@ HsaEvent* InterruptSignal::EventPool::alloc() {
 
 void InterruptSignal::EventPool::free(HsaEvent* evt) {
   if (evt == nullptr) return;
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
   events_.push_back(unique_event_ptr(evt));
 }
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/ipc_signal.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/ipc_signal.cpp
index 2c22918053..c8e07fe2c0 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/ipc_signal.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/ipc_signal.cpp
@@ -50,7 +50,7 @@
 namespace rocr {
 namespace core {
 
-KernelMutex IPCSignal::lock_;
+std::mutex IPCSignal::lock_;
 
 SharedMemory::SharedMemory(const hsa_amd_ipc_memory_t* handle, size_t len) {
   hsa_status_t err = Runtime::runtime_singleton_->IPCAttach(handle, len, 0, NULL, &ptr_);
@@ -85,7 +85,7 @@ Signal* IPCSignal::Attach(const hsa_amd_ipc_signal_t* ipc_signal_handle) {
 
   hsa_signal_t handle = SharedSignal::Convert(shared.signal());
 
-  ScopedAcquire<KernelMutex> lock(&lock_);
+  std::lock_guard<std::mutex> lock(lock_);
   Signal* ret = core::Signal::DuplicateHandle(handle);
   if (ret == nullptr) ret = new IPCSignal(std::move(shared));
   return ret;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp
index 3d34133c7f..254e7b80d8 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp
@@ -48,6 +48,7 @@
 #include <string>
 #include <vector>
 #include <list>
+#include <shared_mutex>
 #if defined(__linux__)
 #include <link.h>
 #include <dlfcn.h>
@@ -119,7 +120,7 @@ bool g_use_mwaitx;
 Runtime* Runtime::runtime_singleton_ = NULL;
 
 hsa_status_t Runtime::Acquire() {
-  ScopedAcquire<KernelMutex> boot(&bootstrap_lock());
+  std::lock_guard<std::mutex> boot(bootstrap_lock());
 
   if (runtime_singleton_ == NULL) {
     memset(log_flags, 0, sizeof(log_flags));
@@ -146,7 +147,7 @@ hsa_status_t Runtime::Acquire() {
 }
 
 hsa_status_t Runtime::Release() {
-  ScopedAcquire<KernelMutex> boot(&bootstrap_lock());
+  std::lock_guard<std::mutex> boot(bootstrap_lock());
 
   if (runtime_singleton_ == nullptr) return HSA_STATUS_ERROR_NOT_INITIALIZED;
 
@@ -192,7 +193,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
     agents_by_gpuid_[0] = agent;
 
     // Add cpu regions to the system region list.
-    for (const core::MemoryRegion* region : agent->regions()) {
+    for (const auto& region : agent->regions()) {
       if (region->fine_grain()) {
         system_regions_fine_.push_back(region);
       } else {
@@ -216,7 +217,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
             assert(alignment <= 4096);
             void* ptr = NULL;
             return (HSA_STATUS_SUCCESS ==
-                    core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
+                    core::Runtime::runtime_singleton_->AllocateMemory(pool.get(), size, alloc_flags,
                                                                       &ptr, agent_node_id))
                 ? ptr
                 : NULL;
@@ -336,7 +337,7 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
   hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id);
   // Track the allocation result so that it could be freed properly.
   if (status == HSA_STATUS_SUCCESS) {
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);
     allocation_map_[*address] = AllocationRegion(region, size, size_requested, alloc_flags);
   }
 
@@ -354,7 +355,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
   MemoryRegion::AllocateFlags alloc_flags = core::MemoryRegion::AllocateNoFlags;
 
   {
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);
 
     std::map<const void*, AllocationRegion>::iterator it = allocation_map_.find(ptr);
 
@@ -458,7 +459,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
 
 hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_callback_t callback,
                                               void* user_data) {
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
   auto mem = allocation_map_.upper_bound(ptr);
   if (mem != allocation_map_.begin()) {
     mem--;
@@ -482,7 +483,7 @@ hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_ca
 hsa_status_t Runtime::DeregisterReleaseNotifier(void* ptr,
                                                 hsa_amd_deallocation_callback_t callback) {
   hsa_status_t ret = HSA_STATUS_ERROR_INVALID_ARGUMENT;
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
   auto mem = allocation_map_.upper_bound(ptr);
   if (mem != allocation_map_.begin()) {
     mem--;
@@ -552,7 +553,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
   // GPU-CPU
   // Must ensure that system memory is visible to the GPU during the copy.
   const AMD::MemoryRegion* system_region =
-      static_cast<const AMD::MemoryRegion*>(system_regions_fine_[0]);
+      static_cast<const AMD::MemoryRegion*>(system_regions_fine_[0].get());
 
   void* gpuPtr = nullptr;
   const auto& locked_copy = [&](void*& ptr, core::Agent* locking_agent) {
@@ -698,7 +699,7 @@ hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
   size_t alloc_size = 0;
 
   {
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);
 
     std::map<const void*, AllocationRegion>::const_iterator it = allocation_map_.find(ptr);
 
@@ -929,7 +930,7 @@ hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents,
   *size = info.SizeInBytes;
   *ptr = info.MemoryAddress;
 
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
   allocation_map_[info.MemoryAddress] = AllocationRegion(
       nullptr, info.SizeInBytes, info.SizeInBytes, core::MemoryRegion::AllocateNoFlags);
 
@@ -1055,7 +1056,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi
 
   {  // memory_lock protects access to the NMappedNodes array and fragment user data since these may
      // change with calls to memory APIs.
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);
 
     if (VMemoryPtrInfo(ptr, &retInfo, alloc, num_agents_accessible, accessible) ==
         HSA_STATUS_SUCCESS) {
@@ -1196,7 +1197,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi
 
 hsa_status_t Runtime::SetPtrInfoData(const void* ptr, void* userptr) {
   {  // Use allocation map if possible to handle fragments.
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);
     const auto& it = allocation_map_.find(ptr);
     if (it != allocation_map_.end()) {
       it->second.user_ptr = userptr;
@@ -1307,7 +1308,7 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) {
      size_t len = 0;
 
      // Search for registered export pointer
-     ScopedAcquire<KernelMutex> lock(&ipc_sock_server_lock_);
+     std::lock_guard<std::mutex> lock(ipc_sock_server_lock_);
      for (auto& conns : ipc_sock_server_conns_) {
        if (conn_handle == conns.first) {
          ptr = reinterpret_cast<void *>(conn_handle);
@@ -1372,7 +1373,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han
     if (useFrag) {
       handle->handle[6] |= 0x80000000 | fragOffset;
       // Prevent realloction of fragment for better performance.
-      ScopedAcquire<KernelSharedMutex::Shared> lock(memory_lock_.shared());
+      std::shared_lock<std::shared_mutex> lock(memory_lock_);
       err = allocation_map_[ptr].region->IPCFragmentExport(ptr);
       assert(err == HSA_STATUS_SUCCESS && "Region inconsistent with address map.");
     }
@@ -1439,7 +1440,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han
 
   close(dmabuf_fd);
 
-  ScopedAcquire<KernelMutex> lock(&ipc_sock_server_lock_);
+  std::lock_guard<std::mutex> lock(ipc_sock_server_lock_);
 #if defined(__linux__)
   if (!ipc_sock_server_conns_.size()) { // create new runtime socket server
     struct sockaddr_un address;
@@ -1549,7 +1550,7 @@ int Runtime::IPCClientImport(uint32_t conn_handle, uint64_t dmabuf_fd_handle,
 
       // Store the buffer object handle in allocation map for later use
       if (err == HSAKMT_STATUS_SUCCESS) {
-        ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+        std::lock_guard<std::shared_mutex> lock(memory_lock_);
         allocation_map_[*importAddress] =
             AllocationRegion(nullptr, *importSize, *importSize, core::MemoryRegion::AllocateNoFlags);
         allocation_map_[*importAddress].ldrm_bo = res.buf_handle;
@@ -1579,7 +1580,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
       importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
       len = Min(len, importSize - fragOffset);
     }
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);
     allocation_map_[importAddress] =
         AllocationRegion(nullptr, len, len, core::MemoryRegion::AllocateNoFlags);
     allocation_map_[importAddress].ldrm_bo = ldrm_bo;
@@ -1711,7 +1712,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
 hsa_status_t Runtime::IPCDetach(void* ptr) {
   bool ldrmImportCleaned = false;
   {  // Handle imported fragments.
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::unique_lock<std::shared_mutex> lock(memory_lock_);  // unique_lock: unlocked early below.
     const auto& it = allocation_map_.find(ptr);
     if (it != allocation_map_.end()) {
       if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -1728,7 +1729,7 @@ hsa_status_t Runtime::IPCDetach(void* ptr) {
       assert(!"Unimplemented!");
 #endif
       allocation_map_.erase(it);
-      lock.Release();  // Can't hold memory lock when using pointer info.
+      lock.unlock();  // Can't hold memory lock when using pointer info.
 
       PtrInfoBlockData block = {};
       hsa_amd_pointer_info_t info = {};
@@ -1954,7 +1955,7 @@ void Runtime::AsyncEventsPool::clear() {
 }
 
 Runtime::AsyncEventItem* Runtime::AsyncEventsPool::alloc() {
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
   if (free_list_.empty()) {
     AsyncEventItem* block = reinterpret_cast<AsyncEventItem*>(
         allocate_()(block_size_ * sizeof(AsyncEventItem), __alignof(AsyncEventItem), core::MemoryRegion::AllocateNonPaged, 0));
@@ -1985,7 +1986,7 @@ void Runtime::AsyncEventsPool::free(AsyncEventItem* ptr) {
   if (ptr == nullptr) return;
 
   ptr->~AsyncEventItem();
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
 
   ifdebug {
     bool valid = false;
@@ -2059,33 +2060,33 @@ void Runtime::BindErrorHandlers() {
 
   // Create memory event with manual reset to avoid racing condition
   // with driver in case of multiple concurrent VM faults.
-  vm_fault_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true);
+  vm_fault_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true));
 
   // Create an interrupt signal object to contain the memory event.
   // This signal object will be registered with the async handler global
   // thread.
-  vm_fault_signal_ = new core::InterruptSignal(0, vm_fault_event_);
+  vm_fault_signal_.reset(new core::InterruptSignal(0, vm_fault_event_.get()));
 
   if (!vm_fault_signal_->IsValid() || vm_fault_signal_->EopEvent() == NULL) {
     assert(false && "Failed on creating VM fault signal");
     return;
   }
 
-  SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), HSA_SIGNAL_CONDITION_NE, 0,
-                        VMFaultHandler, reinterpret_cast<void*>(vm_fault_signal_));
+  SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0,
+                        VMFaultHandler, reinterpret_cast<void*>(vm_fault_signal_.get()));
 
   // Create HW exception event which is for Non-RAS events
-  hw_exception_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true);
+  hw_exception_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true));
 
-  hw_exception_signal_ = new core::InterruptSignal(0, hw_exception_event_);
+  hw_exception_signal_.reset(new core::InterruptSignal(0, hw_exception_event_.get()));
 
   if (!hw_exception_signal_->IsValid() || hw_exception_signal_->EopEvent() == NULL) {
     assert(false && "Failed on creating HW Exception signal");
     return;
   }
 
-  SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_), HSA_SIGNAL_CONDITION_NE, 0,
-                        HwExceptionHandler, reinterpret_cast<void*>(hw_exception_signal_));
+  SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0,
+                        HwExceptionHandler, reinterpret_cast<void*>(hw_exception_signal_.get()));
 }
 
 bool Runtime::HwExceptionHandler(hsa_signal_value_t val, void* arg) {
@@ -2262,7 +2263,8 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
 }
 
 void Runtime::PrintMemoryMapNear(void* ptr) {
-  runtime_singleton_->memory_lock_.Acquire();
+  // Exclusive lock while allocation_map_ is read; unique_lock so it can be unlocked early.
+  std::unique_lock<std::shared_mutex> lock(runtime_singleton_->memory_lock_);
   auto it = runtime_singleton_->allocation_map_.upper_bound(ptr);
   for (int i = 0; i < 2; i++) {
     if (it != runtime_singleton_->allocation_map_.begin()) it--;
@@ -2287,8 +2289,9 @@ void Runtime::PrintMemoryMapNear(void* ptr) {
     it++;
   }
   fprintf(stderr, "\n");
   it = start;
-  runtime_singleton_->memory_lock_.Release();
+  lock.unlock();  // Early release: the code that follows runs without memory_lock_.
+
   hsa_amd_pointer_info_t info = {};
   PtrInfoBlockData block = {};
   uint32_t count = 0;
@@ -2408,7 +2411,7 @@ hsa_status_t Runtime::Load() {
 
   BindErrorHandlers();
 
-  loader_ = amd::hsa::loader::Loader::Create(&loader_context_);
+  loader_.reset(amd::hsa::loader::Loader::Create(&loader_context_));
 
   // Load extensions
   LoadExtensions();
@@ -2449,8 +2452,8 @@ void Runtime::Unload() {
   UnloadTools();
   UnloadExtensions();
 
-  amd::hsa::loader::Loader::Destroy(loader_);
-  loader_ = nullptr;
+  amd::hsa::loader::Loader::Destroy(loader_.get());
+  loader_.reset();  // Loader::Destroy() no longer deletes; loader_ presumably frees it here.
 
   for(auto nodeAgent: agents_by_node_) {
     for (auto agent: nodeAgent.second)
@@ -2462,17 +2465,17 @@ void Runtime::Unload() {
 
   if (vm_fault_signal_ != nullptr) {
     vm_fault_signal_->DestroySignal();
-    vm_fault_signal_ = nullptr;
+    vm_fault_signal_.reset();  // NOTE(review): confirm no double free after DestroySignal().
   }
-  core::InterruptSignal::DestroyEvent(vm_fault_event_);
-  vm_fault_event_ = nullptr;
+  // NOTE(review): assumes vm_fault_event_'s deleter calls DestroyEvent() - confirm.
+  vm_fault_event_.reset();
 
   if (hw_exception_signal_ != nullptr) {
     hw_exception_signal_->DestroySignal();
-    hw_exception_signal_ = nullptr;
+    hw_exception_signal_.reset();  // NOTE(review): same double-free question as above.
   }
-  core::InterruptSignal::DestroyEvent(hw_exception_event_);
-  hw_exception_event_ = nullptr;
+  // NOTE(review): assumes hw_exception_event_'s deleter calls DestroyEvent() - confirm.
+  hw_exception_event_.reset();
 
   SharedSignalPool.clear();
 
@@ -2890,7 +2893,7 @@ void Runtime::AsyncEvents::Clear() {
 
 hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback,
                                                   void* data) {
-  ScopedAcquire<KernelMutex> lock(&system_event_lock_);
+  std::lock_guard<std::mutex> lock(system_event_lock_);
   system_event_handlers_.push_back(
       std::make_pair(AMD::callback_t<hsa_amd_system_event_callback_t>(callback), data));
   return HSA_STATUS_SUCCESS;
@@ -2898,7 +2901,7 @@ hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_
 
 std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
 Runtime::GetSystemEventHandlers() {
-  ScopedAcquire<KernelMutex> lock(&system_event_lock_);
+  std::lock_guard<std::mutex> lock(system_event_lock_);
   return system_event_handlers_;
 }
 
@@ -3269,7 +3272,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
   }
 
   {
-    ScopedAcquire<KernelMutex> lock(&prefetch_lock_);
+    std::lock_guard<std::mutex> lock(prefetch_lock_);
     // Remove all fully overlapped and trim partially overlapped ranges.
     // Get iteration bounds
     auto start = prefetch_map_.upper_bound(base);
@@ -3332,7 +3335,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
 
   // Remove the prefetch's ranges from the map.
   static auto removePrefetchRanges = [](PrefetchOp* op) {
-    ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
+    std::lock_guard<std::mutex> lock(Runtime::runtime_singleton_->prefetch_lock_);
     auto it = op->prefetch_map_entry;
     while (it != Runtime::runtime_singleton_->prefetch_map_.end()) {
       auto next = it->second.next;
@@ -3389,7 +3392,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
 
   std::vector<std::pair<uintptr_t, uintptr_t>> holes;
 
-  ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
+  std::lock_guard<std::mutex> lock(Runtime::runtime_singleton_->prefetch_lock_);
   auto start = prefetch_map_.upper_bound(base);
   if (start != prefetch_map_.begin()) start--;
   auto stop = prefetch_map_.lower_bound(end);
@@ -3441,7 +3444,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
 hsa_status_t Runtime::DmaBufExport(const void* ptr, size_t size, int* dmabuf, uint64_t* offset,
                                    uint64_t flags) {
 #ifdef __linux__
-  ScopedAcquire<KernelSharedMutex::Shared> lock(memory_lock_.shared());
+  std::shared_lock<std::shared_mutex> lock(memory_lock_);
   // Lookup containing allocation.
   auto mem = allocation_map_.upper_bound(ptr);
   if (mem != allocation_map_.begin()) {
@@ -3507,7 +3510,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add
 
   if (!alignment) alignment = rocr::os::PageSize();
 
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
 
   if (flags & HSA_AMD_VMEM_ADDRESS_NO_REGISTER) {
     size_t requested = size + alignment - rocr::os::PageSize();
@@ -3548,7 +3551,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add
 }
 
 hsa_status_t Runtime::VMemoryAddressFree(void* va, size_t size) {
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
   std::map<const void*, AddressHandle>::iterator it = reserved_address_map_.find(va);
 
   if (it == reserved_address_map_.end()) {
@@ -3580,7 +3583,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
   if (!IsMultipleOf(size, memRegion->GetPageSize()))
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
 
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
   ThunkHandle user_mode_driver_handle;
   hsa_status_t status =
       region->Allocate(size, alloc_flags, &user_mode_driver_handle, 0);
@@ -3597,7 +3600,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
 }
 
 hsa_status_t Runtime::VMemoryHandleRelease(hsa_amd_vmem_alloc_handle_t memoryOnlyHandle) {
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
   auto memoryHandleIt = memory_handle_map_.find(MemoryHandle::Convert(memoryOnlyHandle));
 
   if (memoryHandleIt == memory_handle_map_.end()) {
@@ -3628,7 +3631,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
   uint64_t offset = 0, ret;
   uint64_t drm_cpu_addr = 0;
 
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
   auto addressHandle = VMemoryFindReservedAddressHandle(va);
   if (addressHandle == nullptr ||
       reinterpret_cast<uint8_t*>(va) + size >
@@ -3703,7 +3706,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
 }
 
 hsa_status_t Runtime::VMemoryHandleUnmap(void* va, size_t size) {
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
   std::list<std::pair<void*, MappedHandle*>> mappedHandles;
 
   // va + size may consist of multiple MappedHandle's.
@@ -3921,7 +3924,7 @@ hsa_status_t Runtime::VMemorySetAccess(void* va, size_t size,
     if (targetAgent == NULL || !targetAgent->IsValid()) return HSA_STATUS_ERROR_INVALID_AGENT;
   }
 
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
 
   auto addressHandle = VMemoryFindReservedAddressHandle(va);
   if (addressHandle == nullptr ||
@@ -4014,7 +4017,7 @@ hsa_status_t Runtime::VMemoryGetAccess(const void* va, hsa_access_permission_t*
   *perms = HSA_ACCESS_PERMISSION_NONE;
   bool mappedHandleFound = false;
 
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
 
   auto mappedHandleIt = mapped_handle_map_.upper_bound(va);
   if (mappedHandleIt != mapped_handle_map_.begin()) {
@@ -4076,8 +4079,8 @@ hsa_status_t Runtime::VMemoryImportShareableHandle(int dmabuf_fd,
       return;
     }
 
-    for (const core::MemoryRegion* region : agent->regions()) {
-      const AMD::MemoryRegion* amd_region = reinterpret_cast<const AMD::MemoryRegion*>(region);
+    for (const auto& region : agent->regions()) {
+      const AMD::MemoryRegion* amd_region = reinterpret_cast<const AMD::MemoryRegion*>(region.get());
 
       // TODO: Verify that this works on a system with FINE_GRAINED memory.
       // System's with FINE_GRAINED will have both COARSE and FINE grain... need to get the
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp
index 518b5b121c..2660ddd26c 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp
@@ -58,7 +58,7 @@
 namespace rocr {
 namespace core {
 
-KernelMutex Signal::ipcLock_;
+std::mutex Signal::ipcLock_;
 std::map<decltype(hsa_signal_t::handle), Signal*> Signal::ipcMap_;
 
 void SharedSignalPool_t::clear() {
@@ -76,7 +76,7 @@ void SharedSignalPool_t::clear() {
 }
 
 SharedSignal* SharedSignalPool_t::alloc() {
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
   if (free_list_.empty()) {
     SharedSignal* block = reinterpret_cast<SharedSignal*>(
         allocate_()(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), core::MemoryRegion::AllocateNonPaged, 0));
@@ -109,7 +109,7 @@ void SharedSignalPool_t::free(SharedSignal* ptr) {
   if (ptr == nullptr) return;
 
   ptr->~SharedSignal();
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
 
   ifdebug {
     bool valid = false;
@@ -134,7 +134,7 @@ LocalSignal::LocalSignal(hsa_signal_value_t initial_value, bool exportable)
 }
 
 void Signal::registerIpc() {
-  ScopedAcquire<KernelMutex> lock(&ipcLock_);
+  std::lock_guard<std::mutex> lock(ipcLock_);
   auto handle = Convert(this);
   assert(ipcMap_.find(handle.handle) == ipcMap_.end() &&
          "Can't register the same IPC signal twice.");
@@ -142,7 +142,7 @@ void Signal::registerIpc() {
 }
 
 bool Signal::deregisterIpc() {
-  ScopedAcquire<KernelMutex> lock(&ipcLock_);
+  std::lock_guard<std::mutex> lock(ipcLock_);
   if (refcount_ != 0) return false;
   auto handle = Convert(this);
   const auto& it = ipcMap_.find(handle.handle);
@@ -152,14 +152,14 @@ bool Signal::deregisterIpc() {
 }
 
 Signal* Signal::lookupIpc(hsa_signal_t signal) {
-  ScopedAcquire<KernelMutex> lock(&ipcLock_);
+  std::lock_guard<std::mutex> lock(ipcLock_);
   const auto& it = ipcMap_.find(signal.handle);
   if (it == ipcMap_.end()) return nullptr;
   return it->second;
 }
 
 Signal* Signal::duplicateIpc(hsa_signal_t signal) {
-  ScopedAcquire<KernelMutex> lock(&ipcLock_);
+  std::lock_guard<std::mutex> lock(ipcLock_);
   const auto& it = ipcMap_.find(signal.handle);
   if (it == ipcMap_.end()) return nullptr;
   it->second->refcount_++;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h
index 3c20b88316..a36e29c2cc 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h
@@ -125,16 +125,16 @@ template <typename T> class lazy_ptr {
  private:
   mutable std::unique_ptr<T> obj;
   mutable std::function<T*(void)> func;
-  mutable KernelMutex lock;
+  mutable std::mutex lock;  // Held by make_body() around the func() call and obj update.
 
   // Separated from make to improve inlining.
   void make_body(bool block) const {
     if (block) {
-      lock.Acquire();
-    } else if (!lock.Try()) {
+      lock.lock();  // Blocking caller: wait for the lock.
+    } else if (!lock.try_lock()) {  // Non-blocking caller: return at once if the lock is busy.
       return;
     }
-    MAKE_SCOPE_GUARD([&]() { lock.Release(); });
+    MAKE_SCOPE_GUARD([&]() { lock.unlock(); });  // Presumably runs the unlock at scope exit.
     if (func == nullptr) return;
     T* ptr = func();
     obj.reset(ptr);
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/locks.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/locks.h
index e7fa2f0b5e..133dd06f0b 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/locks.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/locks.h
@@ -90,6 +90,11 @@ class HybridMutex {
       os::PostSemaphore(sem_);
   }
 
+  // std-style names forwarding to Acquire/Release/Try, so std::lock_guard<HybridMutex> works.
+  void lock() { Acquire(); }
+  void unlock() { Release(); }
+  bool try_lock() { return Try(); }
+
  private:
   std::atomic<int> lock_;
   os::Semaphore sem_;
@@ -100,27 +105,6 @@ class HybridMutex {
   DISALLOW_COPY_AND_ASSIGN(HybridMutex);
 };
 
-
-/// @brief: a class represents a kernel mutex.
-/// Uses the kernel's scheduler to keep the waiting thread from being scheduled
-/// until the lock is released (Best for long waits, though anything using
-/// a kernel object is a long wait).
-class KernelMutex {
- public:
-  KernelMutex() { lock_ = os::CreateMutex(); }
-  ~KernelMutex() { os::DestroyMutex(lock_); }
-
-  bool Try() { return os::TryAcquireMutex(lock_); }
-  bool Acquire() { return os::AcquireMutex(lock_); }
-  void Release() { os::ReleaseMutex(lock_); }
-
- private:
-  os::Mutex lock_;
-
-  /// @brief: Disable copiable and assignable ability.
-  DISALLOW_COPY_AND_ASSIGN(KernelMutex);
-};
-
 /// @brief: represents a spin lock.
 /// For very short hold durations on the order of the thread scheduling
 /// quanta or less.
@@ -143,6 +127,11 @@ class SpinMutex {
   }
   void Release() { lock_ = 0; }
 
+  // std-style names forwarding to Acquire/Release/Try, so std::lock_guard can take a SpinMutex.
+  void lock() { Acquire(); }
+  void unlock() { Release(); }
+  bool try_lock() { return Try(); }
+
  private:
   std::atomic<int> lock_;
 
@@ -167,124 +156,6 @@ class KernelEvent {
   DISALLOW_COPY_AND_ASSIGN(KernelEvent);
 };
 
-/// @brief: represents a yielding shared mutex.
-/// aka read/write mutex
-class KernelSharedMutex {
- public:
-  /// @brief: Interfaces ScopedAcquire to shared operations.
-  class Shared {
-   public:
-    explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
-    bool Try() { return lock_->TryShared(); }
-    bool Acquire() { return lock_->AcquireShared(); }
-    void Release() { lock_->ReleaseShared(); }
-
-   private:
-    KernelSharedMutex* lock_;
-  };
-
-  KernelSharedMutex() { lock_ = os::CreateSharedMutex(); }
-  ~KernelSharedMutex() { os::DestroySharedMutex(lock_); }
-
-  // Exclusive mode operations
-  bool Try() { return os::TryAcquireSharedMutex(lock_); }
-  bool Acquire() { return os::AcquireSharedMutex(lock_); }
-  void Release() { os::ReleaseSharedMutex(lock_); }
-
-  // Shared mode operations
-  bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); }
-  bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); }
-  void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); }
-
-  // Return shared operations interface
-  Shared shared() { return Shared(this); }
-
- private:
-  os::SharedMutex lock_;
-
-  /// @brief: Disable copiable and assignable ability.
-  DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
-};
-
-/// @brief: Type trait to identify mutex types
-template <class T> class isMutex {
- public:
-  enum { value = false };
-};
-template <> class isMutex<HybridMutex> {
- public:
-  enum { value = true };
-};
-template <> class isMutex<KernelMutex> {
- public:
-  enum { value = true };
-};
-template <> class isMutex<SpinMutex> {
- public:
-  enum { value = true };
-};
-template <> class isMutex<KernelSharedMutex> {
- public:
-  enum { value = true };
-};
-
-/// @brief: A class behaves as a lock in a scope. When trying to enter into the
-/// critical section, creat a object of this class. After the control path goes
-/// out of the scope, it will release the lock automatically.
-template <class LockType> class ScopedAcquire {
- public:
-  /// @brief: When constructing, acquire the lock.
-  /// @param: lock(Input), pointer to an existing lock.
-  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
-    static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
-    lock_.Acquire();
-  }
-  explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
-    static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
-    lock_.Acquire();
-  }
-
-  /// @brief: when destructing, release the lock.
-  ~ScopedAcquire() {
-    if (doRelease) lock_.Release();
-  }
-
-  /// @brief: Release the lock early.  Avoid using when possible.
-  void Release() {
-    lock_.Release();
-    doRelease = false;
-  }
-
- private:
-  /// @brief: Adapts between pointers to mutex types and mutex pointer types.
-  template <class T, bool B> class container {
-   public:
-    container(T* lock) : lock_(lock) {}
-    __forceinline bool Acquire() { return lock_->Acquire(); }
-    __forceinline void Release() { return lock_->Release(); }
-
-   private:
-    T* lock_;
-  };
-
-  /// @brief: Specialization for mutex pointer types.
-  template <class T> class container<T, false> {
-   public:
-    container(T lock) : lock_(lock) {}
-    __forceinline bool Acquire() { return lock_.Acquire(); }
-    __forceinline void Release() { return lock_.Release(); }
-
-   private:
-    T lock_;
-  };
-
-  container<LockType, isMutex<LockType>::value> lock_;
-  bool doRelease;
-
-  /// @brief: Disable copiable and assignable ability.
-  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
-};
-
 }  // namespace rocr
 
 #endif  // HSA_RUNTIME_CORE_SUTIL_LOCKS_H_
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp b/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp
index 7ce27689ba..500247b537 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp
@@ -286,11 +286,6 @@ namespace code {
       }
     }
 
-    AmdHsaCode::~AmdHsaCode()
-    {
-      for (Symbol* sym : symbols) { delete sym; }
-    }
-
     bool AmdHsaCode::PullElf()
     {
       uint32_t majorVersion, minorVersion;
@@ -330,7 +325,7 @@ namespace code {
       }
       for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) {
         amd::elf::Symbol* elfsym = img->symtab()->symbol(i);
-        Symbol* sym = 0;
+        std::shared_ptr<Symbol> sym;
         switch (elfsym->type()) {
         case STT_AMDGPU_HSA_KERNEL: {
           amd::elf::Section* sec = elfsym->section();
@@ -347,12 +342,12 @@ namespace code {
             out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl;
             return false;
           }
-          sym = new KernelSymbol(elfsym, &akc);
+          sym = std::make_shared<KernelSymbol>(elfsym, &akc);
           break;
         }
         case STT_OBJECT:
         case STT_COMMON:
-          sym = new VariableSymbol(elfsym);
+          sym = std::make_shared<VariableSymbol>(elfsym);
           break;
         default:
           break; // Skip unknown symbols.
@@ -924,9 +919,9 @@ namespace code {
         std::string(module_name ? module_name : ""),
         std::string(symbol_name)
       );
-      for (Symbol* sym : symbols) {
+      for (const auto& sym : symbols) {
         if (sym->Name() == mname) {
-          *s = Symbol::ToHandle(sym);
+          *s = Symbol::ToHandle(sym.get());
           return HSA_STATUS_SUCCESS;
         }
       }
@@ -940,8 +935,8 @@ namespace code {
                                   void* data),
                                 void* data)
     {
-      for (Symbol* sym : symbols) {
-        hsa_code_symbol_t s = Symbol::ToHandle(sym);
+      for (const auto& sym : symbols) {
+        hsa_code_symbol_t s = Symbol::ToHandle(sym.get());
         hsa_status_t status = callback(code_object, s, data);
         if (status != HSA_STATUS_SUCCESS) { return status; }
       }
@@ -1144,8 +1139,8 @@ namespace code {
     {
       if (nullptr == img) { return nullptr; }
       if (!section) { section = HsaText(); }
-      symbols.push_back(new KernelSymbol(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr));
-      return symbols.back();
+      symbols.push_back(std::make_shared<KernelSymbol>(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr));
+      return symbols.back().get();
     }
 
     Symbol* AmdHsaCode::AddVariableSymbol(const std::string &name,
@@ -1157,8 +1152,8 @@ namespace code {
                                           uint64_t size)
     {
       if (nullptr == img) { return nullptr; }
-      symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(section, name, value, size, type, binding, other)));
-      return symbols.back();
+      symbols.push_back(std::make_shared<VariableSymbol>(img->symtab()->addSymbol(section, name, value, size, type, binding, other)));
+      return symbols.back().get();
     }
 
     void AmdHsaCode::AddSectionSymbols()
@@ -1166,16 +1161,16 @@ namespace code {
       if (nullptr == img) { return; }
       for (size_t i = 0; i < dataSections.size(); ++i) {
         if (dataSections[i] && dataSections[i]->flags() & SHF_ALLOC) {
-          symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL)));
+          symbols.push_back(std::make_shared<VariableSymbol>(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL)));
         }
       }
     }
 
     Symbol* AmdHsaCode::GetSymbolByElfIndex(size_t index)
     {
-      for (auto &s : symbols) {
+      for (const auto &s : symbols) {
         if (s && index == s->Index()) {
-          return s;
+          return s.get();
         }
       }
       return nullptr;
@@ -1185,7 +1180,7 @@ namespace code {
     {
       for (auto &s : symbols) {
         if (s && n == s->Name()) {
-          return s;
+          return s.get();
         }
       }
       return nullptr;
@@ -1747,14 +1742,16 @@ namespace code {
       return false;
     }
 
-      AmdHsaCode* AmdHsaCodeManager::FromHandle(hsa_code_object_t c)
+      const std::shared_ptr<AmdHsaCode>& AmdHsaCodeManager::FromHandle(hsa_code_object_t c)
       {
+        // Failure result. It must outlive the call because a reference is returned;
+        // the temporary shared_ptr built by `return 0` would dangle in the caller.
+        static const std::shared_ptr<AmdHsaCode> kNoCode;
         CodeMap::iterator i = codeMap.find(c.handle);
         if (i == codeMap.end()) {
-          AmdHsaCode* code = new AmdHsaCode();
+          std::shared_ptr<AmdHsaCode> code = std::make_shared<AmdHsaCode>();
           const void* buffer = reinterpret_cast<const void*>(c.handle);
           if (!code->InitAsBuffer(buffer, 0)) {
-            delete code;
-            return 0;
+            return kNoCode;
           }
           codeMap[c.handle] = code;
@@ -1770,7 +1764,7 @@ namespace code {
           // Currently, we do not always create map entry for every code object buffer.
           return true;
         }
-        delete i->second;
+        i->second.reset();
         codeMap.erase(i);
         return true;
       }
@@ -1798,7 +1792,7 @@ namespace code {
       }
       for (size_t i = 0; i < img->getSymbolTable()->symbolCount(); ++i) {
         amd::elf::Symbol* elfsym = img->getSymbolTable()->symbol(i);
-        Symbol* sym = 0;
+        std::shared_ptr<Symbol> sym;
         switch (elfsym->type()) {
         case STT_AMDGPU_HSA_KERNEL: {
           amd::elf::Section* sec = elfsym->section();
@@ -1815,12 +1809,12 @@ namespace code {
             out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl;
             return false;
           }
-          sym = new KernelSymbolV2(elfsym, &akc);
+          sym = std::make_shared<KernelSymbolV2>(elfsym, &akc);
           break;
         }
         case STT_OBJECT:
         case STT_COMMON:
-          sym = new VariableSymbolV2(elfsym);
+          sym = std::make_shared<VariableSymbolV2>(elfsym);
           break;
         default:
           break; // Skip unknown symbols.
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp
index 51843f6128..772cd36722 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp
@@ -186,7 +186,6 @@ void Loader::Destroy(Loader *loader)
   _amdgpu_r_debug.r_map = nullptr;
   _amdgpu_r_debug.r_state = r_debug::RT_CONSISTENT;
   r_debug_tail() = nullptr;
-  delete loader;
 }
 
 Executable* AmdHsaCodeLoader::CreateExecutable(
@@ -194,8 +193,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable(
 {
   WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
 
-  executables.push_back(new ExecutableImpl(profile, context, executables.size(), default_float_rounding_mode));
-  return executables.back();
+  executables.push_back(std::make_shared<ExecutableImpl>(profile, context, executables.size(), default_float_rounding_mode));
+  return executables.back().get();
 }
 
 Executable* AmdHsaCodeLoader::CreateExecutable(
@@ -206,8 +205,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable(
 {
   WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
 
-  executables.push_back(new ExecutableImpl(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode));
-  return executables.back();
+  executables.push_back(std::make_shared<ExecutableImpl>(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode));
+  return executables.back().get();
 }
 
 static void AddCodeObjectInfoIntoDebugMap(link_map* map) {
@@ -254,7 +253,7 @@ hsa_status_t AmdHsaCodeLoader::FreezeExecutable(Executable *executable, const ch
   atomic::Fence(std::memory_order_acq_rel);
   _loader_debug_state();
   atomic::Fence(std::memory_order_acq_rel);
-  for (auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
+  for (const auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
     AddCodeObjectInfoIntoDebugMap(&(lco->r_debug_info));
   }
   atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release);
@@ -270,14 +269,13 @@ void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) {
   atomic::Fence(std::memory_order_acq_rel);
   _loader_debug_state();
   atomic::Fence(std::memory_order_acq_rel);
-  for (auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
+  for (const auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
     RemoveCodeObjectInfoFromDebugMap(&(lco->r_debug_info));
   }
   atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release);
   _loader_debug_state();
 
-  executables[((ExecutableImpl*)executable)->id()] = nullptr;
-  delete executable;
+  executables[static_cast<ExecutableImpl*>(executable)->id()].reset();
 }
 
 hsa_status_t AmdHsaCodeLoader::IterateExecutables(
@@ -289,9 +287,9 @@ hsa_status_t AmdHsaCodeLoader::IterateExecutables(
   WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
   assert(callback);
 
-  for (auto &exec : executables) {
+  for (const auto &exec : executables) {
     if(exec != nullptr){
-      hsa_status_t status = callback(Executable::Handle(exec), data);
+      hsa_status_t status = callback(Executable::Handle(exec.get()), data);
       if (status != HSA_STATUS_SUCCESS) {
         return status;
       }
@@ -318,7 +316,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
   this->EnableReadOnlyMode();
 
   size_t actual_num_segment_descriptors = 0;
-  for (auto &executable : executables) {
+  for (const auto &executable : executables) {
     if (executable) {
       actual_num_segment_descriptors += executable->GetNumSegmentDescriptors();
     }
@@ -335,7 +333,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
   }
 
   size_t i = 0;
-  for (auto &executable : executables) {
+  for (const auto &executable : executables) {
     if (executable) {
       i += executable->QuerySegmentDescriptors(segment_descriptors, actual_num_segment_descriptors, i);
     }
@@ -352,7 +350,7 @@ uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address)
     return 0;
   }
 
-  for (auto &exec : executables) {
+  for (const auto &exec : executables) {
     if (exec != nullptr) {
       uint64_t host_address = exec->FindHostAddress(device_address);
       if (host_address != 0) {
@@ -371,9 +369,9 @@ void AmdHsaCodeLoader::PrintHelp(std::ostream& out)
 void AmdHsaCodeLoader::EnableReadOnlyMode()
 {
   rw_lock_.ReaderLock();
-  for (auto &executable : executables) {
+  for (const auto &executable : executables) {  // null slots are skipped by the check below
     if (executable) {
-      ((ExecutableImpl*)executable)->EnableReadOnlyMode();
+      static_cast<ExecutableImpl*>(executable.get())->EnableReadOnlyMode();  // named cast, matching DestroyExecutable
     }
   }
 }
@@ -381,9 +379,9 @@ void AmdHsaCodeLoader::EnableReadOnlyMode()
 void AmdHsaCodeLoader::DisableReadOnlyMode()
 {
   rw_lock_.ReaderUnlock();
-  for (auto &executable : executables) {
+  for (const auto &executable : executables) {  // null slots are skipped by the check below
     if (executable) {
-      ((ExecutableImpl*)executable)->DisableReadOnlyMode();
+      static_cast<ExecutableImpl*>(executable.get())->DisableReadOnlyMode();  // named cast, matching DestroyExecutable
     }
   }
 }
@@ -781,18 +779,10 @@ ExecutableImpl::ExecutableImpl(
 }
 
 ExecutableImpl::~ExecutableImpl() {
-  for (ExecutableObject* o : objects) {
+  for (const auto& o : objects) {  // entries are shared_ptr-held: Destroy() stays explicit, no manual delete
     o->Destroy();
-    delete o;
   }
   objects.clear();
-
-  for (auto &symbol_entry : program_symbols_) {
-    delete symbol_entry.second;
-  }
-  for (auto &symbol_entry : agent_symbols_) {
-    delete symbol_entry.second;
-  }
 }
 
 hsa_status_t ExecutableImpl::DefineProgramExternalVariable(
@@ -812,7 +802,7 @@ hsa_status_t ExecutableImpl::DefineProgramExternalVariable(
 
   program_symbols_.insert(
     std::make_pair(std::string(name),
-                   new VariableSymbol(true,
+                   std::make_shared<VariableSymbol>(true,
                                       "", // Only program linkage symbols can be
                                           // defined.
                                       std::string(name),
@@ -848,7 +838,7 @@ hsa_status_t ExecutableImpl::DefineAgentExternalVariable(
 
   auto insert_status = agent_symbols_.insert(
     std::make_pair(std::make_pair(std::string(name), agent),
-                   new VariableSymbol(true,
+                   std::make_shared<VariableSymbol>(true,
                                       "", // Only program linkage symbols can be
                                           // defined.
                                       std::string(name),
@@ -896,14 +886,14 @@ Symbol* ExecutableImpl::GetSymbolInternal(
   if (!agent) {
     auto program_symbol = program_symbols_.find(mangled_name);
     if (program_symbol != program_symbols_.end()) {
-      return program_symbol->second;
+      return program_symbol->second.get();
     }
     return nullptr;
   }
 
   auto agent_symbol = agent_symbols_.find(std::make_pair(mangled_name, *agent));
   if (agent_symbol != agent_symbols_.end()) {
-    return agent_symbol->second;
+    return agent_symbol->second.get();
   }
   return nullptr;
 }
@@ -916,14 +906,14 @@ hsa_status_t ExecutableImpl::IterateSymbols(
 
   for (auto &symbol_entry : program_symbols_) {
     hsa_status_t hsc =
-      callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
+      callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
     if (HSA_STATUS_SUCCESS != hsc) {
       return hsc;
     }
   }
   for (auto &symbol_entry : agent_symbols_) {
     hsa_status_t hsc =
-      callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
+      callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
     if (HSA_STATUS_SUCCESS != hsc) {
       return hsc;
     }
@@ -948,7 +938,7 @@ hsa_status_t ExecutableImpl::IterateAgentSymbols(
     }
 
     hsa_status_t status = callback(
-        Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second),
+        Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second.get()),
         data);
     if (status != HSA_STATUS_SUCCESS) {
       return status;
@@ -968,7 +958,7 @@ hsa_status_t ExecutableImpl::IterateProgramSymbols(
 
   for (auto &symbol_entry : program_symbols_) {
     hsa_status_t status = callback(
-        Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
+        Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
     if (status != HSA_STATUS_SUCCESS) {
       return status;
     }
@@ -987,10 +977,10 @@ hsa_status_t ExecutableImpl::IterateLoadedCodeObjects(
   ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_);
   assert(callback);
 
-  for (auto &loaded_code_object : loaded_code_objects) {
+  for (const auto& loaded_code_object : loaded_code_objects) {
     hsa_status_t status = callback(
         Executable::Handle(this),
-        LoadedCodeObject::Handle(loaded_code_object),
+        LoadedCodeObject::Handle(loaded_code_object.get()),
         data);
     if (status != HSA_STATUS_SUCCESS) {
       return status;
@@ -1004,7 +994,7 @@ size_t ExecutableImpl::GetNumSegmentDescriptors()
 {
   // assuming we are in readonly mode.
   size_t actual_num_segment_descriptors = 0;
-  for (auto &obj : loaded_code_objects) {
+  for (const auto &obj : loaded_code_objects) {
     actual_num_segment_descriptors += obj->LoadedSegments().size();
   }
   return actual_num_segment_descriptors;
@@ -1020,7 +1010,7 @@ size_t ExecutableImpl::QuerySegmentDescriptors(
   assert(first_empty_segment_descriptor < total_num_segment_descriptors);
 
   size_t i = first_empty_segment_descriptor;
-  for (auto &obj : loaded_code_objects) {
+  for (const auto &obj : loaded_code_objects) {
     assert(i < total_num_segment_descriptors);
     for (auto &seg : obj->LoadedSegments()) {
       segment_descriptors[i].agent = seg->Agent();
@@ -1084,11 +1074,11 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address)
     return execHandle;
   }
 
-  for (auto &exec : executables) {
+  for (const auto &exec : executables) {
     if (exec != nullptr) {
       uint64_t host_address = exec->FindHostAddress(device_address);
       if (host_address != 0) {
-        return Executable::Handle(exec);
+        return Executable::Handle(exec.get());
       }
     }
   }
@@ -1098,7 +1088,7 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address)
 uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address)
 {
   ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_);
-  for (auto &obj : loaded_code_objects) {
+  for (const auto &obj : loaded_code_objects) {
     assert(obj);
     for (auto &seg : obj->LoadedSegments()) {
       assert(seg);
@@ -1224,7 +1214,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
 
   uint32_t codeNum = NextCodeObjectNum();
 
-  code.reset(new code::AmdHsaCode());
+  code = std::make_unique<code::AmdHsaCode>();
 
   std::string substituteFileName;
   for (const Substitute& ss : substitutes) {
@@ -1306,8 +1296,8 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
 
   hsa_status_t status;
 
-  objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), code->ElfSize()));
-  loaded_code_objects.push_back((LoadedCodeObjectImpl*)objects.back());
+  objects.push_back(std::make_shared<LoadedCodeObjectImpl>(this, agent, code->ElfData(), code->ElfSize()));
+  loaded_code_objects.push_back(std::static_pointer_cast<LoadedCodeObjectImpl>(objects.back()));
 
   status = LoadSegments(agent, code.get(), majorVersion);
   if (status != HSA_STATUS_SUCCESS) return status;
@@ -1338,7 +1328,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
   loaded_code_objects.back()->r_debug_info.l_prev = nullptr;
   loaded_code_objects.back()->r_debug_info.l_next = nullptr;
 
-  if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back()); }
+  if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back().get()); }
   return HSA_STATUS_SUCCESS;
 }
 
@@ -1376,18 +1366,18 @@ hsa_status_t ExecutableImpl::LoadSegmentsV2(hsa_agent_t agent,
       AMD_ISA_ALIGN_BYTES, true);
   if (!ptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
 
-  Segment *load_segment = new Segment(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT,
+  std::shared_ptr<Segment> load_segment = std::make_shared<Segment>(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT,
       ptr, size, vaddr, c->DataSegment(0)->offset());
   if (!load_segment) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
 
   hsa_status_t status = HSA_STATUS_SUCCESS;
   for (size_t i = 0; i < c->DataSegmentCount(); ++i) {
-    status = LoadSegmentV2(c->DataSegment(i), load_segment);
+    status = LoadSegmentV2(c->DataSegment(i), load_segment.get());
     if (status != HSA_STATUS_SUCCESS) return status;
   }
 
   objects.push_back(load_segment);
-  loaded_code_objects.back()->LoadedSegments().push_back(load_segment);
+  loaded_code_objects.back()->LoadedSegments().push_back(load_segment.get());
 
   return HSA_STATUS_SUCCESS;
 }
@@ -1398,7 +1388,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
   if (s->memSize() == 0)
     return HSA_STATUS_SUCCESS;
   amdgpu_hsa_elf_segment_t segment = (amdgpu_hsa_elf_segment_t)(s->type() - PT_LOOS);
-  Segment *new_seg = nullptr;
+  std::shared_ptr<Segment> new_seg;
   bool need_alloc = true;
   if (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM && nullptr != program_allocation_segment) {
     new_seg = program_allocation_segment;
@@ -1407,7 +1397,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
   if (need_alloc) {
     void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true);
     if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }
-    new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
+    new_seg = std::make_shared<Segment>(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
     new_seg->Copy(s->vaddr(), s->data(), s->imageSize());
     objects.push_back(new_seg);
 
@@ -1416,7 +1406,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
     }
   }
   assert(new_seg);
-  loaded_code_objects.back()->LoadedSegments().push_back(new_seg);
+  loaded_code_objects.back()->LoadedSegments().push_back(new_seg.get());
   return HSA_STATUS_SUCCESS;
 }
 
@@ -1471,7 +1461,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
   }
 
   uint64_t address = SymbolAddress(agent, sym);
-  SymbolImpl *symbol = nullptr;
+  std::shared_ptr<SymbolImpl> symbol;
   if (string_ends_with(sym->GetSymbolName(), ".kd")) {
     // V3.
     llvm::amdhsa::kernel_descriptor_t kd;
@@ -1486,7 +1476,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
 
     uint64_t size = sym->Size();
 
-    KernelSymbol *kernel_symbol = new KernelSymbol(true,
+    std::shared_ptr<KernelSymbol> kernel_symbol = std::make_shared<KernelSymbol>(true,
                                     sym->GetModuleName(),
                                     sym->GetSymbolName(),
                                     sym->Linkage(),
@@ -1502,7 +1492,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
                                     address);
     symbol = kernel_symbol;
   } else if (sym->IsVariableSymbol()) {
-    symbol = new VariableSymbol(true,
+    symbol = std::make_shared<VariableSymbol>(true,
                        sym->GetModuleName(),
                        sym->GetSymbolName(),
                        sym->Linkage(),
@@ -1537,7 +1527,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
         // calculate end of segment - symbol value.
         size = sym->GetSection()->size() - sym->SectionOffset();
       }
-      KernelSymbol *kernel_symbol = new KernelSymbol(true,
+      std::shared_ptr<KernelSymbol> kernel_symbol = std::make_shared<KernelSymbol>(true,
                                       sym->GetModuleName(),
                                       sym->GetSymbolName(),
                                       sym->Linkage(),
@@ -1970,7 +1960,7 @@ void ExecutableImpl::Print(std::ostream& out)
       << std::endl << std::endl;
   out << "Loaded Objects (total " << objects.size() << ")" << std::endl;
   size_t i = 0;
-  for (ExecutableObject* o : objects) {
+  for (const auto& o : objects) {
     out << "Loaded Object " << i++ << ": ";
     o->Print(out);
     out << std::endl;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp
index 2cd9bdb9d7..9d8a238fb1 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp
@@ -461,7 +461,7 @@ public:
 };
 
 typedef std::string ProgramSymbol;
-typedef std::unordered_map<ProgramSymbol, SymbolImpl*> ProgramSymbolMap;
+typedef std::unordered_map<ProgramSymbol, std::shared_ptr<SymbolImpl>> ProgramSymbolMap;
 
 typedef std::pair<std::string, hsa_agent_t> AgentSymbol;
 struct ASC {
@@ -476,7 +476,7 @@ struct ASH {
     return h ^ (i << 1);
   }
 };
-typedef std::unordered_map<AgentSymbol, SymbolImpl*, ASH, ASC> AgentSymbolMap;
+typedef std::unordered_map<AgentSymbol, std::shared_ptr<SymbolImpl>, ASH, ASC> AgentSymbolMap;
 
 class ExecutableImpl final: public Executable {
 friend class AmdHsaCodeLoader;
@@ -634,15 +634,15 @@ private:
 
   ProgramSymbolMap program_symbols_;
   AgentSymbolMap agent_symbols_;
-  std::vector<ExecutableObject*> objects;
-  Segment *program_allocation_segment;
-  std::vector<LoadedCodeObjectImpl*> loaded_code_objects;
+  std::vector<std::shared_ptr<ExecutableObject>> objects;
+  std::shared_ptr<Segment> program_allocation_segment;
+  std::vector<std::shared_ptr<LoadedCodeObjectImpl>> loaded_code_objects;
 };
 
 class AmdHsaCodeLoader : public Loader {
 private:
   Context* context;
-  std::vector<Executable*> executables;
+  std::vector<std::shared_ptr<Executable>> executables;
   amd::hsa::common::ReaderWriterLock rw_lock_;
 
 public:
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp
index 8931aa9f92..7eb0621d65 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp
@@ -282,7 +282,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal(
     size_t interval, size_t latency, size_t buffer_size,
     hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, void* client_cb_data,
     hsa_ven_amd_pcs_t* handle, agent_pcs_create_fn_t agent_pcs_create_fn) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);
 
   handle->handle = ++pc_sampling_id_;
   // create a new PcSamplingSession(agent, method, units, interval, latency, buffer_size,
@@ -305,7 +305,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal(
 }
 
 hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);
   auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
   if (pcSamplingSessionIt == pc_sampling_.end()) {
     debug_warning(false && "Cannot find PcSampling session");
@@ -319,7 +319,7 @@ hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
 }
 
 hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);
   auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
   if (pcSamplingSessionIt == pc_sampling_.end()) {
     debug_warning(false && "Cannot find PcSampling session");
@@ -331,7 +331,7 @@ hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) {
 }
 
 hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);
   auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
   if (pcSamplingSessionIt == pc_sampling_.end()) {
     debug_warning(false && "Cannot find PcSampling session");
@@ -343,7 +343,7 @@ hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) {
 }
 
 hsa_status_t PcsRuntime::PcSamplingFlush(hsa_ven_amd_pcs_t handle) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);
   auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
   if (pcSamplingSessionIt == pc_sampling_.end()) {
     debug_warning(false && "Cannot find PcSampling session");
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h
index 3547693e8d..72a5ad4480 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h
@@ -166,7 +166,7 @@ class PcsRuntime {
 }
   // Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle
   std::map<uint64_t, PcSamplingSession> pc_sampling_;
-  KernelMutex pc_sampling_lock_;
+  std::mutex pc_sampling_lock_;
   uint64_t pc_sampling_id_;
 
   DISALLOW_COPY_AND_ASSIGN(PcsRuntime);