SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers (#2146)

* SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers
* Replace KernelMutex and KernelSharedMutex abstractions with std::mutex and std::shared_mutex
* Replaced unique_locks with lock_guards
* More changes
* Replace new and delete with smart pointers
* Replaced some more with shared ptrs
* Replacements with smart pointers - pt 2
* missed change
2026-01-06 10:59:34 -05:00
commit 637b0d71f0
@@ -47,6 +47,7 @@

 #include <assert.h>
 #include <vector>
+#include <mutex>

 #include "core/inc/checked.h"
 #include "core/inc/isa.h"
@@ -291,7 +292,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
                               void* value) const = 0;

  // @brief Returns an array of regions owned by the agent.
-  virtual const std::vector<const core::MemoryRegion*>& regions() const = 0;
+  virtual const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const = 0;

  // @brief Returns the ISA's supported by the agent.
  // @details The returned vector is a list of pointers to the supported ISA,
@@ -336,7 +337,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
  __forceinline void Disable() { enabled_ = false; }

  virtual void Trim() {
-    for (auto region : regions()) region->Trim();
+    for (const auto& region : regions()) region->Trim();  // Trims each owned region; const& avoids a shared_ptr copy per iteration.
  }

  virtual void ReleaseResources() { }
@@ -385,7 +386,7 @@ protected:
  // Serial memory operations are needed to ensure, among other things, that allocation failures are
  // due to true OOM conditions and per region caching (Trim and Allocate must be serial and
  // exclusive to ensure this).
-  KernelMutex agent_memory_lock_;
+  std::mutex agent_memory_lock_;

  // Forbid copying and moving of this object
  DISALLOW_COPY_AND_ASSIGN(Agent);
@@ -82,7 +82,7 @@ public:
 /// @brief Override from core::Agent.
 const std::vector<const core::Isa*>& supported_isas() const override { return supported_isas_; }

- const std::vector<const core::MemoryRegion*>& regions() const override { return regions_; }
+ const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override { return regions_; }

 /// @brief Getter for the AIE system allocator.
 const std::function<void*(size_t size, size_t align, core::MemoryRegion::AllocateFlags flags)>&
@@ -101,7 +101,7 @@ private:
  /// @brief Setup the memory allocators used by this agent.
  void InitAllocators();

-  std::vector<const core::MemoryRegion *> regions_;
+  std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;
  std::function<void *(size_t size, size_t align,
                       core::MemoryRegion::AllocateFlags flags)>
      system_allocator_;
@@ -306,7 +306,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
  // GPU-visible indirect buffer holding PM4 commands.
  void* pm4_ib_buf_;
  uint32_t pm4_ib_size_b_;
-  KernelMutex pm4_ib_mutex_;
+  std::mutex pm4_ib_mutex_;

  // Error handler control variable.
  std::atomic<uint32_t> dynamicScratchState, exceptionState;
@@ -322,11 +322,11 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
  Signal* exception_signal_;

  // CU mask lock
-  KernelMutex mask_lock_;
+  std::mutex mask_lock_;

  // Mutex to prevent AsyncReclaimScratch and HandleInsufficientScratch from
  // happening at the same time.
-  KernelMutex scratch_lock_;
+  std::mutex scratch_lock_;

  // Current CU mask
  std::vector<uint32_t> cu_mask_;
@@ -345,10 +345,10 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
  }

  // Mutex for queue_event_ manipulation
-KernelMutex& queue_lock() {
+std::mutex& queue_lock() {
  // This allocation is meant to last until the last thread has exited.
  // It is intentionally not freed.
-  static KernelMutex* queue_lock_ = new KernelMutex();
+  static std::mutex* queue_lock_ = new std::mutex();  // Function-local static: created once on first call (thread-safe since C++11).
  return *queue_lock_;
 }

@@ -255,7 +255,7 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {

  // Internal signals for blocking APIs
  core::unique_signal_ptr signals_[2];
-  KernelMutex lock_;
+  std::mutex lock_;
  bool parity_;

  /// Queue resource descriptor for doorbell, read
@@ -127,7 +127,7 @@ class CpuAgent : public core::Agent {
  }

  // @brief Override from core::Agent.
-  const std::vector<const core::MemoryRegion*>& regions() const override {
+  const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override {
    return regions_;
  }

@@ -151,7 +151,7 @@ class CpuAgent : public core::Agent {
  // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
  // region returns ::HSA_STATUS_SUCCESS.
  hsa_status_t VisitRegion(
-      const std::vector<const core::MemoryRegion*>& regions,
+      const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
      hsa_status_t (*callback)(hsa_region_t region, void* data),
      void* data) const;

@@ -166,7 +166,7 @@ class CpuAgent : public core::Agent {
  std::vector<std::unique_ptr<core::Cache>> caches_;

  // @brief Array of regions owned by this agent.
-  std::vector<const core::MemoryRegion*> regions_;
+  std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;

  DISALLOW_COPY_AND_ASSIGN(CpuAgent);
 };
@@ -394,7 +394,7 @@ class GpuAgent : public GpuAgentInt {
  }

  // @brief Override from core::Agent.
-  const std::vector<const core::MemoryRegion*>& regions() const override {
+  const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions() const override {
    return regions_;
  }

@@ -536,7 +536,7 @@ class GpuAgent : public GpuAgentInt {
  // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
  // region returns ::HSA_STATUS_SUCCESS.
  hsa_status_t VisitRegion(
-      const std::vector<const core::MemoryRegion*>& regions,
+      const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
      hsa_status_t (*callback)(hsa_region_t region, void* data),
      void* data) const;

@@ -594,7 +594,7 @@ class GpuAgent : public GpuAgentInt {
  std::vector<const core::Agent*> xgmi_peer_list_;

  // Protects xgmi_peer_list_
-  KernelMutex xgmi_peer_list_lock_;
+  std::mutex xgmi_peer_list_lock_;

  // @brief AQL queues for cache management and blit compute usage.
  enum QueueEnum {
@@ -607,19 +607,19 @@ class GpuAgent : public GpuAgentInt {
  lazy_ptr<core::Queue> queues_[QueueCount];

  // @brief Mutex to protect the update to coherency type.
-  KernelMutex coherency_lock_;
+  std::mutex coherency_lock_;

  // @brief Mutex to protect access to scratch pool.
-  KernelMutex scratch_lock_;
+  std::mutex scratch_lock_;

  // @brief Mutex to protect access to ::t1_.
-  KernelMutex t1_lock_;
+  std::mutex t1_lock_;

  // @brief Mutex to protect access to blit objects.
-  KernelMutex blit_lock_;
+  std::mutex blit_lock_;

  // @brief Mutex to protect sdma gang submissions.
-  KernelMutex sdma_gang_lock_;
+  std::mutex sdma_gang_lock_;

  // @brief GPU tick on initialization.
  HsaClockCounters t0_;
@@ -638,7 +638,7 @@ class GpuAgent : public GpuAgentInt {
  std::vector<std::unique_ptr<core::Cache>> caches_;

  // @brief Array of regions owned by this agent.
-  std::vector<const core::MemoryRegion*> regions_;
+  std::vector<std::shared_ptr<const core::MemoryRegion>> regions_;

  core::Isa* isa_;

@@ -729,7 +729,7 @@ class GpuAgent : public GpuAgentInt {
  struct {
    lazy_ptr<core::Queue> queue_;
    int ref_ct_;
-    KernelMutex lock_;
+    std::mutex lock_;
  } gws_queue_;

  // @brief list of AQL queues owned by this agent. Indexed by queue pointer
@@ -763,7 +763,7 @@ class GpuAgent : public GpuAgentInt {
  /// @brief Coarse-grain deallocator on this GPU.
  std::function<void(void*)> coarsegrain_deallocator_;

-  void* trap_handler_tma_region_;
+  std::unique_ptr<void, std::function<void(void*)>> trap_handler_tma_region_;

  /* PC Sampling fields - begin */
  /* 2nd level Trap handler code is based on the offsets within this structure */
@@ -181,7 +181,7 @@ namespace code {
      std::vector<Segment*> dataSegments;
      std::vector<Section*> dataSections;
      std::vector<RelocationSection*> relocationSections;
-      std::vector<Symbol*> symbols;
+      std::vector<std::shared_ptr<Symbol>> symbols;
      bool combineDataSegments;
      Segment* hsaSegments[AMDGPU_HSA_SEGMENT_LAST][2];
      Section* hsaSections[AMDGPU_HSA_SECTION_LAST];
@@ -234,7 +234,7 @@ namespace code {
      uint32_t OsAbi() const { return img->OsAbi(); }

      AmdHsaCode(bool combineDataSegments = true);
-      virtual ~AmdHsaCode();
+      virtual ~AmdHsaCode() = default;

      std::string output() { return out.str(); }
      bool LoadFromFile(const std::string& filename);
@@ -347,7 +347,7 @@ namespace code {
      RelocationSection* GetRelocationSection(size_t i) { return relocationSections[i]; }

      size_t SymbolCount() { return symbols.size(); }
-      Symbol* GetSymbol(size_t i) { return symbols[i]; }
+      Symbol* GetSymbol(size_t i) { return symbols[i].get(); }
      Symbol* GetSymbolByElfIndex(size_t index);
      Symbol* FindSymbol(const std::string &n);

@@ -362,11 +362,11 @@ namespace code {

    class AmdHsaCodeManager {
    private:
-      typedef std::unordered_map<uint64_t, AmdHsaCode*> CodeMap;
+      typedef std::unordered_map<uint64_t, std::shared_ptr<AmdHsaCode>> CodeMap;  // Map values share ownership of each AmdHsaCode.
      CodeMap codeMap;

    public:
-      AmdHsaCode* FromHandle(hsa_code_object_t handle);
+      const std::shared_ptr<AmdHsaCode>& FromHandle(hsa_code_object_t handle);  // NOTE(review): returns a reference, presumably to the codeMap entry; confirm it cannot dangle after Destroy().
      bool Destroy(hsa_code_object_t handle);
    };

@@ -422,7 +422,7 @@ private:
  Executable(const Executable &e);
  Executable& operator=(const Executable &e);

-  static std::vector<Executable*> executables;
+  static std::vector<std::shared_ptr<Executable>> executables;
  static std::mutex executables_mutex;
 };

@@ -187,7 +187,7 @@ private:

  // Protects against concurrent allow_access calls to fragments of the same block by virtue of all
  // fragments of the block routing to the same MemoryRegion.
-  mutable KernelMutex access_lock_;
+  mutable std::mutex access_lock_;

  static const size_t kPageSize_;

@@ -216,7 +216,7 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi

 private:
  // Serialize packet interception processing.
-  KernelMutex lock_;
+  std::mutex lock_;

  // Largest processed packet index.
  uint64_t next_packet_;
@@ -103,7 +103,7 @@ class IPCSignal : private SharedMemorySignal, public BusyWaitSignal {
    static int rtti_id_ = 0;
      return rtti_id_;
  }
-  static KernelMutex lock_;
+  static std::mutex lock_;

  explicit IPCSignal(SharedMemorySignal&& abi_block)
      : SharedMemorySignal(std::move(abi_block)), BusyWaitSignal(signal(), true) {}
@@ -51,6 +51,7 @@
 #include <tuple>
 #include <utility>
 #include <thread>
+#include <shared_mutex>
 #if defined(__linux__)
 #include <sys/un.h>
 #include <xf86drm.h>
@@ -437,15 +438,15 @@ class Runtime {

  Agent* region_gpu() { return region_gpu_; }

-  const std::vector<const MemoryRegion*>& system_regions_fine() const {
+  const std::vector<std::shared_ptr<const MemoryRegion>>& system_regions_fine() const {
    return system_regions_fine_;
  }

-  const std::vector<const MemoryRegion*>& system_regions_coarse() const {
+  const std::vector<std::shared_ptr<const MemoryRegion>>& system_regions_coarse() const {
    return system_regions_coarse_;
  }

-  amd::hsa::loader::Loader* loader() { return loader_; }
+  amd::hsa::loader::Loader* loader() { return loader_.get(); }

  amd::LoaderContext* loader_context() { return &loader_context_; }

@@ -719,10 +720,10 @@ class Runtime {

  // Will be created before any user could call hsa_init but also could be
  // destroyed before incorrectly written programs call hsa_shutdown.
-  static __forceinline KernelMutex& bootstrap_lock() {
+  static __forceinline std::mutex& bootstrap_lock() {
    // This allocation is meant to last until the last thread has exited.
    // It is intentionally not freed.
-    static KernelMutex* bootstrap_lock_ = new KernelMutex;
+    static std::mutex* bootstrap_lock_ = new std::mutex;  // Function-local static: created once on first call (thread-safe since C++11).
    return *bootstrap_lock_;
  }
  Runtime();
@@ -780,7 +781,7 @@ class Runtime {
  // Also ensures atomicity of pointer info queries by interlocking
  // KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo
  // registered & mapped arrays.
-  KernelSharedMutex memory_lock_;
+  std::shared_mutex memory_lock_;

  // Array containing driver interfaces for compatible agent kernel-mode
  // drivers. Currently supports AIE agents.
@@ -811,16 +812,16 @@ class Runtime {
  std::vector<uint32_t> gpu_ids_;

  // List of all fine grain system memory region in the platform.
-  std::vector<const MemoryRegion*> system_regions_fine_;
+  std::vector<std::shared_ptr<const MemoryRegion>> system_regions_fine_;

  // List of all coarse grain system memory region in the platform.
-  std::vector<const MemoryRegion*> system_regions_coarse_;
+  std::vector<std::shared_ptr<const MemoryRegion>> system_regions_coarse_;

  // Matrix of IO link.
  std::vector<LinkInfo> link_matrix_;

  // Loader instance.
-  amd::hsa::loader::Loader* loader_;
+  std::unique_ptr<amd::hsa::loader::Loader> loader_;

  // Loader context.
  amd::LoaderContext loader_context_;
@@ -832,7 +833,7 @@ class Runtime {
  std::map<const void*, AllocationRegion> allocation_map_;

  // Pending prefetch containers.
-  KernelMutex prefetch_lock_;
+  std::mutex prefetch_lock_;
  prefetch_map_t prefetch_map_;

  // Allocator using ::system_region_
@@ -853,24 +854,29 @@ class Runtime {
  // Number of Numa Nodes
  size_t num_nodes_;

+  struct HsaEventDeleter {  // Stateless std::unique_ptr deleter for HsaEvent.
+    void operator()(HsaEvent* event) const { InterruptSignal::DestroyEvent(event); }  // const: usable through a const deleter too.
+  };
+  using unique_hsa_event_ptr = std::unique_ptr<HsaEvent, HsaEventDeleter>;
+
  // @brief AMD HSA event to monitor for virtual memory access fault.
-  HsaEvent* vm_fault_event_;
+  unique_hsa_event_ptr vm_fault_event_;

  // @brief HSA signal to contain the VM fault event.
-  Signal* vm_fault_signal_;
+  unique_signal_ptr vm_fault_signal_;

  // @brief AMD HSA event to monitor for HW exceptions.
-  HsaEvent* hw_exception_event_;
+  unique_hsa_event_ptr hw_exception_event_;

-  // @brief HSA signal to contain the HW exceptionevent.
+  // @brief HSA signal to contain the HW exception event.
-  Signal* hw_exception_signal_;
+  unique_signal_ptr hw_exception_signal_;

  // Custom system event handlers.
  std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
      system_event_handlers_;

  // System event handler lock
-  KernelMutex system_event_lock_;
+  std::mutex system_event_lock_;

  // Internal queue creation notifier
  AMD::callback_t<hsa_amd_runtime_queue_notifier> internal_queue_create_notifier_;
@@ -898,8 +904,8 @@ class Runtime {

  // IPC DMA buf unix domain socket server dmabuf FD passing
  int ipc_sock_server_fd_;
-  std::map<uint64_t, size_t> ipc_sock_server_conns_;
-  KernelMutex ipc_sock_server_lock_;
+  std::map<uint64_t, int> ipc_sock_server_conns_;
+  std::mutex ipc_sock_server_lock_;

 private:
  void CheckVirtualMemApiSupport();
@@ -50,6 +50,7 @@
 #include <memory>
 #include <vector>
 #include <utility>
+#include <mutex>

 #include "hsakmt/hsakmt.h"

@@ -499,7 +500,7 @@ class Signal {
  core::Agent* async_copy_agent_;

 private:
-  static KernelMutex ipcLock_;
+  static std::mutex ipcLock_;
  static std::map<decltype(hsa_signal_t::handle), Signal*> ipcMap_;

  static Signal* lookupIpc(hsa_signal_t signal);
@@ -66,7 +66,6 @@ AieAgent::AieAgent(uint32_t node, const HsaNodeProperties& node_props)
 }

 AieAgent::~AieAgent() {
-  std::for_each(regions_.begin(), regions_.end(), DeleteObject());
  regions_.clear();
 }

@@ -75,8 +74,8 @@ hsa_status_t AieAgent::VisitRegion(bool include_peer,
                                                            void *data),
                                   void *data) const {
  AMD::callback_t<decltype(callback)> call(callback);
-  for (const auto r : regions_) {
-    hsa_region_t region_handle(core::MemoryRegion::Convert(r));
+  for (const auto& r : regions_) {
+    hsa_region_t region_handle(core::MemoryRegion::Convert(r.get()));
    hsa_status_t err = call(region_handle, data);
    if (err != HSA_STATUS_SUCCESS) {
      return err;
@@ -321,24 +320,25 @@ void AieAgent::InitRegionList() {
  /// explicit sync operations.
  regions_.reserve(3);
  regions_.push_back(
-      new MemoryRegion(false, true, false, false, true, this, sys_mem_props));
+    std::make_shared<MemoryRegion>(false, true, false, false, true, this, sys_mem_props));
  regions_.push_back(
-      new MemoryRegion(false, false, false, false, true, this, dev_mem_props));
-  regions_.push_back(new MemoryRegion(false, false, false, false, true, this,
-                                      other_mem_props));
+    std::make_shared<MemoryRegion>(false, false, false, false, true, this, dev_mem_props));
+  regions_.push_back(
+    std::make_shared<MemoryRegion>(false, false, false, false, true, this, other_mem_props));
 }

 void AieAgent::InitAllocators() {
-  for (const auto *region : regions()) {
+  for (const auto& region : regions()) {
    const MemoryRegion *amd_mem_region(
-        static_cast<const MemoryRegion *>(region));
+        static_cast<const MemoryRegion *>(region.get()));
    if (amd_mem_region->kernarg()) {
+      const core::MemoryRegion* region_ptr = region.get();
      system_allocator_ =
-          [region](size_t size, size_t align,
+          [region_ptr](size_t size, size_t align,
                   core::MemoryRegion::AllocateFlags alloc_flags) -> void * {
        void *mem(nullptr);
        return (core::Runtime::runtime_singleton_->AllocateMemory(
-                    region, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS)
+                    region_ptr, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS)
                   ? mem
                   : nullptr;
      };
@@ -165,8 +165,8 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
  // Set group and private memory apertures in amd_queue_.
  auto& regions = agent->regions();

-  for (auto region : regions) {
-    const MemoryRegion* amdregion = static_cast<const AMD::MemoryRegion*>(region);
+  for (const auto& region : regions) {
+    const MemoryRegion* amdregion = static_cast<const AMD::MemoryRegion*>(region.get());
    uint64_t base = amdregion->GetBaseAddress();

    if (amdregion->IsLDS()) {
@@ -217,7 +217,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
  }

  MAKE_NAMED_SCOPE_GUARD(EventGuard, [&]() {
-    ScopedAcquire<KernelMutex> _lock(&queue_lock());
+    std::lock_guard<std::mutex> _lock(queue_lock());
    queue_count()--;
    if (queue_count() == 0) {
      core::InterruptSignal::DestroyEvent(queue_event());
@@ -232,7 +232,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
  });

  if (core::g_use_interrupt_wait) {
-    ScopedAcquire<KernelMutex> _lock(&queue_lock());
+    std::lock_guard<std::mutex> _lock(queue_lock());
    queue_count()++;
    if (queue_event() == nullptr) {
      assert(queue_count() == 1 && "Inconsistency in queue event reference counting found.\n");
@@ -387,7 +387,7 @@ AqlQueue::~AqlQueue() {
  FreeQueueMemory();

  if (core::g_use_interrupt_wait) {
-    ScopedAcquire<KernelMutex> lock(&queue_lock());
+    std::lock_guard<std::mutex> lock(queue_lock());
    queue_count()--;
    if (queue_count() == 0) {
      core::InterruptSignal::DestroyEvent(queue_event());
@@ -777,7 +777,7 @@ void AqlQueue::AsyncReclaimMainScratch() {
  tool::notify_event_scratch_async_reclaim_start(public_handle(),
                                                 HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE);

-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);

  // Unmap the queue. CP will check amd_queue_ fields on re-map
  Suspend();
@@ -849,7 +849,7 @@ void AqlQueue::AsyncReclaimAltScratch() {
  tool::notify_event_scratch_async_reclaim_start(public_handle(),
                                                 HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT);

-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);

  // Unmap the queue. CP will check amd_queue_ fields on re-map
  Suspend();
@@ -1014,7 +1014,7 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
  const uint64_t device_size = size_per_thread * lanes_per_wave * device_slots;
  const uint64_t dispatch_size = size_per_thread * lanes_per_wave * dispatch_slots;

-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);

  // scratch.use_alt_limit will be 0 if alt scratch is not supported or disabled
  if (dispatch_size < scratch.use_alt_limit && dispatch_slots < device_slots) {
@@ -1393,7 +1393,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t*
  if ((mask.size() == mask_dwords) && (tail_mask != 0)) mask[mask_dwords - 1] &= tail_mask;

  // Apply mask if non-default or not queue initialization.
-  ScopedAcquire<KernelMutex> lock(&mask_lock_);
+  std::lock_guard<std::mutex> lock(mask_lock_);
  if ((!cu_mask_.empty()) || (num_cu_mask_count != 0) || (!global_mask.empty())) {

    // Devices with WGPs must conform to even-indexed contiguous pairwise CU enablement.
@@ -1414,7 +1414,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t*
 }

 hsa_status_t AqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) {
-  ScopedAcquire<KernelMutex> lock(&mask_lock_);
+  std::lock_guard<std::mutex> lock(mask_lock_);
  assert(!cu_mask_.empty() && "No current cu_mask!");

  uint32_t user_dword_count = num_cu_mask_count / 32;
@@ -1440,7 +1440,7 @@ void AqlQueue::SetProfiling(bool enabled) {
 void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope_t acquireFence,
                          hsa_fence_scope_t releaseFence, hsa_signal_t* in_signal) {
  // pm4_ib_buf_ is a shared resource, so mutually exclude here.
-  ScopedAcquire<KernelMutex> lock(&pm4_ib_mutex_);
+  std::lock_guard<std::mutex> lock(pm4_ib_mutex_);

  // Obtain reference to any container queue.
  core::Queue* queue = core::Queue::Convert(public_handle());
@@ -293,7 +293,7 @@ static bool DepSignalCompleteHandler(hsa_signal_value_t signal_value, void *arg
 template <bool useGCR>
 hsa_status_t BlitSdma<useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd_size,
                                                     uint64_t size) {
-  ScopedAcquire<KernelMutex> lock(&lock_);
+  std::unique_lock<std::mutex> lock(lock_);

  // Alternate between completion signals
  // Using two allows overlapping command writing and copies
@@ -310,7 +310,7 @@ hsa_status_t BlitSdma<useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd
  // Mark signal as in use, guard against exception leaving the signal in an unusable state.
  completionSignal->StoreRelaxed(2);
  MAKE_SCOPE_GUARD([&]() { completionSignal->StoreRelaxed(0); });
-  lock.Release();
+  lock.unlock();

  std::vector<core::Signal*> gang_signals(0);

@@ -64,7 +64,6 @@ CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props,
 }

 CpuAgent::~CpuAgent() {
-  std::for_each(regions_.begin(), regions_.end(), DeleteObject());
  regions_.clear();
 }

@@ -87,17 +86,17 @@ void CpuAgent::InitRegionList() {
    if (system_prop != mem_props.end()) system_props = *system_prop;

    // Fine-Grain Memory
-    regions_.push_back(new MemoryRegion(true, false, is_apu_node, false, true, this, system_props));
+    regions_.push_back(std::make_shared<MemoryRegion>(true, false, is_apu_node, false, true, this, system_props));

    // Ext-Fine-Grain Memory
-    regions_.push_back(new MemoryRegion(false, false, is_apu_node, true, true, this, system_props));
+    regions_.push_back(std::make_shared<MemoryRegion>(false, false, is_apu_node, true, true, this, system_props));

    // Kernargs
-    regions_.push_back(new MemoryRegion(true, true, is_apu_node, false, true, this, system_props));
+    regions_.push_back(std::make_shared<MemoryRegion>(true, true, is_apu_node, false, true, this, system_props));

    if (!is_apu_node) {
      // Coarse Grain
-      regions_.push_back(new MemoryRegion(false, false, is_apu_node, false, true, this, system_props));
+      regions_.push_back(std::make_shared<MemoryRegion>(false, false, is_apu_node, false, true, this, system_props));
    }
  }
 }
@@ -150,12 +149,12 @@ hsa_status_t CpuAgent::VisitRegion(bool include_peer,
 }

 hsa_status_t CpuAgent::VisitRegion(
-    const std::vector<const core::MemoryRegion*>& regions,
+    const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
    hsa_status_t (*callback)(hsa_region_t region, void* data),
    void* data) const {
-  for (const core::MemoryRegion* region : regions) {
+  for (const std::shared_ptr<const rocr::core::MemoryRegion>& region : regions) {
    if (!region->user_visible()) continue;
-    hsa_region_t region_handle = core::MemoryRegion::Convert(region);
+    hsa_region_t region_handle = core::MemoryRegion::Convert(region.get());
    hsa_status_t status = callback(region_handle, data);
    if (status != HSA_STATUS_SUCCESS) {
      return status;
@@ -112,7 +112,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
      scratch_limit_async_threshold_(0),
      scratch_cache_(
          [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }),
-      trap_handler_tma_region_(NULL),
+      trap_handler_tma_region_(nullptr, [this](void* ptr) {  // Deleter: frees the TMA buffer via the fine-grain deallocator.
+        if (ptr && this->finegrain_deallocator()) this->finegrain_deallocator()(ptr);
+      }),  // Guard tests the callable actually invoked; calling an empty std::function throws std::bad_function_call.
      rec_sdma_eng_override_(false),
      pcs_hosttrap_data_(),
      pcs_stochastic_data_(),
@@ -246,7 +248,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
 GpuAgent::~GpuAgent() {
  for (auto& blit : blits_) blit.reset();

-  std::for_each(regions_.begin(), regions_.end(), DeleteObject());
  regions_.clear();
 }

@@ -454,22 +455,20 @@ void GpuAgent::InitRegionList() {
          memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
        case HSA_HEAPTYPE_GPU_LDS:
        case HSA_HEAPTYPE_GPU_SCRATCH: {
-          MemoryRegion* region =
-              new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]);
-
+          std::shared_ptr<MemoryRegion> region = std::make_shared<MemoryRegion>(false, false, false, false, true, this, mem_props[mem_idx]);
          regions_.push_back(region);

          if (region->IsLocalMemory()) {
            // Extended Fine-Grain memory
            if (!(isa_->GetMajorVersion() == 12 && isa_->GetMinorVersion() == 0))
              regions_.push_back(
-                  new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx]));
+                  std::make_shared<MemoryRegion>(false, false, false, true, true, this, mem_props[mem_idx]));

            // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
            bool user_visible = (properties_.HiveID != 0) ||
                core::Runtime::runtime_singleton_->flag().fine_grain_pcie();

-            regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this,
+            regions_.push_back(std::make_shared<MemoryRegion>(true, false, false, false, user_visible, this,
                                                mem_props[mem_idx]));
          }
          break;
@@ -561,7 +560,7 @@ void GpuAgent::ReserveScratch()
  size_t available;
  hsa_status_t err = driver().AvailableMemory(node_id(), &available);
  assert(err == HSA_STATUS_SUCCESS && "AvailableMemory failed");
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
  if (!scratch_cache_.reserved_bytes() && reserved_sz && available > 8 * reserved_sz) {
    HSAuint64 alt_va;
    void* reserved_base = scratch_pool_.alloc(reserved_sz);
@@ -676,20 +675,20 @@ hsa_status_t GpuAgent::VisitRegion(bool include_peer,
 }

 hsa_status_t GpuAgent::VisitRegion(
-    const std::vector<const core::MemoryRegion*>& regions,
+    const std::vector<std::shared_ptr<const core::MemoryRegion>>& regions,
    hsa_status_t (*callback)(hsa_region_t region, void* data),
    void* data) const {
  AMD::callback_t<decltype(callback)> call(callback);
-  for (const core::MemoryRegion* region : regions) {
+  for (const auto& region : regions) {
    if (!region->user_visible()) continue;

    const AMD::MemoryRegion* amd_region =
-        reinterpret_cast<const AMD::MemoryRegion*>(region);
+        reinterpret_cast<const AMD::MemoryRegion*>(region.get());

    // Only expose system, local, and LDS memory.
    if (amd_region->IsSystem() || amd_region->IsLocalMemory() ||
        amd_region->IsLDS()) {
-      hsa_region_t region_handle = core::MemoryRegion::Convert(region);
+      hsa_region_t region_handle = core::MemoryRegion::Convert(region.get());
      hsa_status_t status = call(region_handle, data);
      if (status != HSA_STATUS_SUCCESS) {
        return status;
@@ -910,7 +909,7 @@ void GpuAgent::InitGWS() {
 }

 void GpuAgent::GWSRelease() {
-  ScopedAcquire<KernelMutex> lock(&gws_queue_.lock_);
+  std::lock_guard<std::mutex> lock(gws_queue_.lock_);
  gws_queue_.ref_ct_--;
  if (gws_queue_.ref_ct_ != 0) return;
  InitGWS();
@@ -968,22 +967,22 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) {
 }

 void GpuAgent::SetCopyRequestRefCount(bool set) {
-  ScopedAcquire<KernelMutex> lock(&blit_lock_);
+  std::unique_lock<std::mutex> lock(blit_lock_);  // unique_lock (not lock_guard): the wait loop below unlocks and relocks it.
  while (pending_copy_stat_check_ref_) {
-    blit_lock_.Release();
+    lock.unlock();  // Drop blit_lock_ while yielding.
    os::YieldThread();
-    blit_lock_.Acquire();
+    lock.lock();  // Reacquire before re-checking pending_copy_stat_check_ref_.
  }
  if (!set && pending_copy_req_ref_) pending_copy_req_ref_--;
  else pending_copy_req_ref_++;
 }

 void GpuAgent::SetCopyStatusCheckRefCount(bool set) {
-  ScopedAcquire<KernelMutex> lock(&blit_lock_);
+  std::unique_lock<std::mutex> lock(blit_lock_);
  while (pending_copy_req_ref_) {
-    blit_lock_.Release();
+    lock.unlock();
    os::YieldThread();
-    blit_lock_.Acquire();
+    lock.lock();
  }
  if (!set && pending_copy_stat_check_ref_) pending_copy_stat_check_ref_--;
  else pending_copy_stat_check_ref_++;
@@ -1059,7 +1058,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
                      std::min(gang_factor, properties_.NumSdmaXgmiEngines);
  }

-  ScopedAcquire<KernelMutex> lock(&sdma_gang_lock_);
+  std::lock_guard<std::mutex> lock(sdma_gang_lock_);
  // Manage internal gang signals
  std::vector<core::Signal*> gang_signals;
  if (gang_factor > 1) {
@@ -1642,7 +1641,7 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {

      if (status != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

-      for (auto r : regions()) availableBytes += ((AMD::MemoryRegion*)r)->GetCacheSize();
+      for (const auto& r : regions()) availableBytes += ((AMD::MemoryRegion*)(r.get()))->GetCacheSize();

      availableBytes += scratch_cache_.free_bytes() - scratch_cache_.reserved_bytes();

@@ -1730,7 +1729,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
                                   core::Queue** queue) {
  // Handle GWS queues.
  if (queue_type == HSA_QUEUE_TYPE_COOPERATIVE) {
-    ScopedAcquire<KernelMutex> lock(&gws_queue_.lock_);
+    std::lock_guard<std::mutex> lock(gws_queue_.lock_);
    auto ret = (*gws_queue_.queue_).get();
    if (ret != nullptr) {
      gws_queue_.ref_ct_++;
@@ -1876,7 +1875,7 @@ void GpuAgent::AcquireQueueMainScratch(ScratchInfo& scratch) {
  */
  bool large;

-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
  const size_t small_limit = scratch_pool_.size() >> 3;
  bool use_reclaim = true;

@@ -2035,7 +2034,7 @@ void GpuAgent::AcquireQueueAltScratch(ScratchInfo& scratch) {
  uint64_t size_per_wave = AlignUp(scratch.alt_size_per_thread * properties_.WaveFrontSize, 1024);
  if (size_per_wave > MAX_WAVE_SCRATCH) return;

-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);

  // Ensure mapping will be in whole pages.
  scratch.alt_size = AlignUp(scratch.alt_size, 4096);
@@ -2176,7 +2175,7 @@ uint64_t GpuAgent::TranslateTime(uint64_t tick) {
  // Limit errors due to relative frequency drift to ~0.5us.  Sync clocks at 16Hz.
  const int64_t max_extrapolation = core::Runtime::runtime_singleton_->sys_clock_freq() >> 4;

-  ScopedAcquire<KernelMutex> lock(&t1_lock_);
+  std::lock_guard<std::mutex> lock(t1_lock_);
  // Limit errors due to correlated pair certainty to ~0.5us.
  // extrapolated time < (0.5us / half clock read certainty) * delay between clock measures
  // clock read certainty is <4us.
@@ -2261,26 +2260,27 @@ hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttra
    ((uint64_t*)tma_region_host)[1] = (uint64_t)pcs_stochastic_buffers;

    if (!trap_handler_tma_region_) {
-      trap_handler_tma_region_ = (uint64_t*)finegrain_allocator()(2 * sizeof(uint64_t), 0);
-      if (trap_handler_tma_region_ == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+      void* mem = finegrain_allocator()(2 * sizeof(uint64_t), 0);
+      if (!mem) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+      trap_handler_tma_region_.reset(mem);

      // NearestCpuAgent owns pool returned system_allocator()
      auto cpuAgent = GetNearestCpuAgent()->public_handle();

      hsa_status_t ret =
-          AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_);
+          AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_.get());
      assert(ret == HSA_STATUS_SUCCESS);
    }

    /* On non-large BAR systems, we may not be able to access device memory, so do a DmaCopy */
-    if (DmaCopy(trap_handler_tma_region_, tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS)
+    if (DmaCopy(trap_handler_tma_region_.get(), tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS)
      return HSA_STATUS_ERROR;

    tma_size = 2 * sizeof(uint64_t);
-    tma_addr = trap_handler_tma_region_;
+    tma_addr = trap_handler_tma_region_.get();
  } else if (trap_handler_tma_region_) {
-    finegrain_deallocator()(trap_handler_tma_region_);
-    trap_handler_tma_region_ = NULL;
+    trap_handler_tma_region_.reset(nullptr);
  }

  // Bind the trap handler to this node.
@@ -2398,7 +2398,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
  uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines;
  assert((xgmi_engine_cnt > 0) && ("Illegal condition, should not happen"));

-  ScopedAcquire<KernelMutex> lock(&xgmi_peer_list_lock_);
+  std::lock_guard<std::mutex> lock(xgmi_peer_list_lock_);

  for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
    uint64_t dst_handle = dst_agent.public_handle().handle;
@@ -2490,19 +2490,20 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
 void GpuAgent::Trim() {
  Agent::Trim();
  AsyncReclaimScratchQueues();
-  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
+  std::lock_guard<std::mutex> lock(scratch_lock_);
  scratch_cache_.trim(false);
 }

 void GpuAgent::InitAllocators() {
-  for (auto pool : GetNearestCpuAgent()->regions()) {
+  for (const auto& pool : GetNearestCpuAgent()->regions()) {
    if (pool->kernarg()) {
-      system_allocator_ = [pool](size_t size, size_t alignment,
+      const core::MemoryRegion* pool_ptr = pool.get();
+      system_allocator_ = [pool_ptr](size_t size, size_t alignment,
                                 MemoryRegion::AllocateFlags alloc_flags) -> void* {
        assert(alignment <= 4096);
        void* ptr = nullptr;
        return (HSA_STATUS_SUCCESS ==
-                core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, &ptr))
+                core::Runtime::runtime_singleton_->AllocateMemory(pool_ptr, size, alloc_flags, &ptr))
            ? ptr
            : nullptr;
      };
@@ -2513,14 +2514,14 @@ void GpuAgent::InitAllocators() {
  assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool.");

  // Setup this GPU's fine-grain and coarse-grain allocators.
-  for (auto region : regions()) {
-    const AMD::MemoryRegion* amd_region = static_cast<const AMD::MemoryRegion*>(region);
+  for (const auto& region : regions()) {
+    const AMD::MemoryRegion* amd_region = static_cast<const AMD::MemoryRegion*>(region.get());

-    auto region_allocator = [region](size_t size,
+    auto region_allocator = [amd_region](size_t size,
                                     MemoryRegion::AllocateFlags alloc_flags) -> void* {
      void* ptr = nullptr;
       return (HSA_STATUS_SUCCESS ==
-               core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr))
+               core::Runtime::runtime_singleton_->AllocateMemory(amd_region, size, alloc_flags, &ptr))
           ? ptr
           : nullptr;
    };
@@ -283,18 +283,18 @@ const core::MemoryRegion* RegionMemory::AgentLocal(hsa_agent_t agent, bool is_co
  assert(amd_agent->device_type() == core::Agent::kAmdGpuDevice && "Invalid agent type.");
  auto agent_local_region =
      std::find_if(amd_agent->regions().begin(), amd_agent->regions().end(),
-                   [&](const core::MemoryRegion* region) {
-                     const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region;
+                   [&](const std::shared_ptr<const core::MemoryRegion>& region) {
+                     const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region.get();
                     return amd_region->IsLocalMemory() && (!amd_region->fine_grain());
                   });
-  return agent_local_region == amd_agent->regions().end() ? nullptr : *agent_local_region;
+  return agent_local_region == amd_agent->regions().end() ? nullptr : agent_local_region->get();
 }

 const core::MemoryRegion* RegionMemory::System(bool is_code) {
  if (is_code)
-    return core::Runtime::runtime_singleton_->system_regions_coarse()[0];
+    return core::Runtime::runtime_singleton_->system_regions_coarse()[0].get();
  else
-    return core::Runtime::runtime_singleton_->system_regions_fine()[0];
+    return core::Runtime::runtime_singleton_->system_regions_fine()[0].get();
 }

 bool RegionMemory::Allocate(size_t size, size_t align, bool zero) {
@@ -48,6 +48,8 @@
 #include "core/inc/amd_memory_region.h"

 #include <algorithm>
+#include <mutex>
+#include <shared_mutex>

 #include "core/inc/runtime.h"
 #include "core/inc/amd_cpu_agent.h"
@@ -132,7 +134,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
 MemoryRegion::~MemoryRegion() {}

 hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const {
-  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
+  std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
  return AllocateImpl(size, alloc_flags, address, agent_node_id);
 }

@@ -160,7 +162,7 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
 }

 hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
-  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
+  std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
  return FreeImpl(address, size);
 }

@@ -172,7 +174,7 @@ hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const {

 // TODO:  Look into a better name and/or making this process transparent to exporting.
 hsa_status_t MemoryRegion::IPCFragmentExport(void* address) const {
-  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
+  std::lock_guard<std::mutex> lock(owner()->agent_memory_lock_);
  if (!fragment_allocator_.discardBlock(address)) return HSA_STATUS_ERROR_INVALID_ALLOCATION;
  return HSA_STATUS_SUCCESS;
 }
@@ -448,7 +450,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
  std::vector<uint64_t> union_agents;
  info.size = sizeof(info);

-  ScopedAcquire<KernelMutex> lock(&access_lock_);
+  std::lock_guard<std::mutex> lock(access_lock_);

  if (core::Runtime::runtime_singleton_->PtrInfo(const_cast<void*>(ptr), &info, malloc,
                                                 &agent_count, &accessible,
@@ -512,8 +514,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,

  {  // Sequence with pointer info since queries to other fragments of the block may be adjusted by
     // this call.
-    ScopedAcquire<KernelSharedMutex::Shared> lock(
-        core::Runtime::runtime_singleton_->memory_lock_.shared());
+    std::shared_lock<std::shared_mutex> lock(core::Runtime::runtime_singleton_->memory_lock_);
    uint64_t alternate_va = 0;
    if (owner()->driver().MakeMemoryResident(ptr, size, &alternate_va, &map_flag,
                                             whitelist_nodes.size(),
@@ -1804,7 +1804,7 @@ hsa_status_t hsa_code_object_serialize(
  IS_BAD_PTR(serialized_code_object);
  IS_BAD_PTR(serialized_code_object_size);

-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
  if (!code) {
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }
@@ -1982,7 +1982,7 @@ hsa_status_t hsa_code_object_get_info(
  IS_OPEN();
  IS_BAD_PTR(value);

-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
  if (!code) {
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }
@@ -2039,7 +2039,7 @@ hsa_status_t hsa_code_object_get_symbol(
  IS_BAD_PTR(symbol_name);
  IS_BAD_PTR(symbol);

-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
  if (!code) {
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }
@@ -2059,7 +2059,7 @@ hsa_status_t hsa_code_object_get_symbol_from_name(
  IS_BAD_PTR(symbol_name);
  IS_BAD_PTR(symbol);

-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
  if (!code) {
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }
@@ -2097,7 +2097,7 @@ hsa_status_t hsa_code_object_iterate_symbols(
  IS_OPEN();
  IS_BAD_PTR(callback);

-  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object);
+  amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get();
  if (!code) {
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }
@@ -759,7 +759,7 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
  }

  const AMD::MemoryRegion* system_region = static_cast<const AMD::MemoryRegion*>(
-      core::Runtime::runtime_singleton_->system_regions_coarse()[0]);
+      core::Runtime::runtime_singleton_->system_regions_coarse()[0].get());

  return system_region->Lock(num_agent, agents, host_ptr, size, 0, agent_ptr);
  CATCH;
@@ -799,7 +799,7 @@ hsa_status_t hsa_amd_memory_unlock(void* host_ptr) {

  const AMD::MemoryRegion* system_region =
      reinterpret_cast<const AMD::MemoryRegion*>(
-          core::Runtime::runtime_singleton_->system_regions_fine()[0]);
+          core::Runtime::runtime_singleton_->system_regions_fine()[0].get());

  return system_region->Unlock(host_ptr);
  CATCH;
@@ -340,7 +340,7 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
    return;
  }

-  ScopedAcquire<KernelMutex> lock(&lock_);
+  std::lock_guard<std::mutex> lock(lock_);

  // Submit overflow packets.
  if (!overflow_.empty()) {
@@ -48,7 +48,7 @@ namespace rocr {
 namespace core {

 HsaEvent* InterruptSignal::EventPool::alloc() {
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
  if (events_.empty()) {
    if (!allEventsAllocated) {
      HsaEvent* evt = InterruptSignal::CreateEvent(HSA_EVENTTYPE_SIGNAL, false);
@@ -64,7 +64,7 @@ HsaEvent* InterruptSignal::EventPool::alloc() {

 void InterruptSignal::EventPool::free(HsaEvent* evt) {
  if (evt == nullptr) return;
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
  events_.push_back(unique_event_ptr(evt));
 }

@@ -50,7 +50,7 @@
 namespace rocr {
 namespace core {

-KernelMutex IPCSignal::lock_;
+std::mutex IPCSignal::lock_;

 SharedMemory::SharedMemory(const hsa_amd_ipc_memory_t* handle, size_t len) {
  hsa_status_t err = Runtime::runtime_singleton_->IPCAttach(handle, len, 0, NULL, &ptr_);
@@ -85,7 +85,7 @@ Signal* IPCSignal::Attach(const hsa_amd_ipc_signal_t* ipc_signal_handle) {

  hsa_signal_t handle = SharedSignal::Convert(shared.signal());

-  ScopedAcquire<KernelMutex> lock(&lock_);
+  std::lock_guard<std::mutex> lock(lock_);
  Signal* ret = core::Signal::DuplicateHandle(handle);
  if (ret == nullptr) ret = new IPCSignal(std::move(shared));
  return ret;
@@ -48,6 +48,7 @@
 #include <string>
 #include <vector>
 #include <list>
+#include <shared_mutex>
 #if defined(__linux__)
 #include <link.h>
 #include <dlfcn.h>
@@ -119,7 +120,7 @@ bool g_use_mwaitx;
 Runtime* Runtime::runtime_singleton_ = NULL;

 hsa_status_t Runtime::Acquire() {
-  ScopedAcquire<KernelMutex> boot(&bootstrap_lock());
+  std::lock_guard<std::mutex> boot(bootstrap_lock());

  if (runtime_singleton_ == NULL) {
    memset(log_flags, 0, sizeof(log_flags));
@@ -146,7 +147,7 @@ hsa_status_t Runtime::Acquire() {
 }

 hsa_status_t Runtime::Release() {
-  ScopedAcquire<KernelMutex> boot(&bootstrap_lock());
+  std::lock_guard<std::mutex> boot(bootstrap_lock());

  if (runtime_singleton_ == nullptr) return HSA_STATUS_ERROR_NOT_INITIALIZED;

@@ -192,7 +193,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
    agents_by_gpuid_[0] = agent;

    // Add cpu regions to the system region list.
-    for (const core::MemoryRegion* region : agent->regions()) {
+    for (const auto& region : agent->regions()) {
      if (region->fine_grain()) {
        system_regions_fine_.push_back(region);
      } else {
@@ -216,7 +217,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
            assert(alignment <= 4096);
            void* ptr = NULL;
            return (HSA_STATUS_SUCCESS ==
-                    core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
+                    core::Runtime::runtime_singleton_->AllocateMemory(pool.get(), size, alloc_flags,
                                                                      &ptr, agent_node_id))
                ? ptr
                : NULL;
@@ -336,7 +337,7 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
  hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id);
  // Track the allocation result so that it could be freed properly.
  if (status == HSA_STATUS_SUCCESS) {
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);
    allocation_map_[*address] = AllocationRegion(region, size, size_requested, alloc_flags);
  }

@@ -354,7 +355,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
  MemoryRegion::AllocateFlags alloc_flags = core::MemoryRegion::AllocateNoFlags;

  {
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);

    std::map<const void*, AllocationRegion>::iterator it = allocation_map_.find(ptr);

@@ -458,7 +459,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {

 hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_callback_t callback,
                                              void* user_data) {
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
  auto mem = allocation_map_.upper_bound(ptr);
  if (mem != allocation_map_.begin()) {
    mem--;
@@ -482,7 +483,7 @@ hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_ca
 hsa_status_t Runtime::DeregisterReleaseNotifier(void* ptr,
                                                hsa_amd_deallocation_callback_t callback) {
  hsa_status_t ret = HSA_STATUS_ERROR_INVALID_ARGUMENT;
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
  auto mem = allocation_map_.upper_bound(ptr);
  if (mem != allocation_map_.begin()) {
    mem--;
@@ -552,7 +553,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
  // GPU-CPU
  // Must ensure that system memory is visible to the GPU during the copy.
  const AMD::MemoryRegion* system_region =
-      static_cast<const AMD::MemoryRegion*>(system_regions_fine_[0]);
+      static_cast<const AMD::MemoryRegion*>(system_regions_fine_[0].get());

  void* gpuPtr = nullptr;
  const auto& locked_copy = [&](void*& ptr, core::Agent* locking_agent) {
@@ -698,7 +699,7 @@ hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
  size_t alloc_size = 0;

  {
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);

    std::map<const void*, AllocationRegion>::const_iterator it = allocation_map_.find(ptr);

@@ -929,7 +930,7 @@ hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents,
  *size = info.SizeInBytes;
  *ptr = info.MemoryAddress;

-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
  allocation_map_[info.MemoryAddress] = AllocationRegion(
      nullptr, info.SizeInBytes, info.SizeInBytes, core::MemoryRegion::AllocateNoFlags);

@@ -1055,7 +1056,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi

  {  // memory_lock protects access to the NMappedNodes array and fragment user data since these may
     // change with calls to memory APIs.
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);

    if (VMemoryPtrInfo(ptr, &retInfo, alloc, num_agents_accessible, accessible) ==
        HSA_STATUS_SUCCESS) {
@@ -1196,7 +1197,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi

 hsa_status_t Runtime::SetPtrInfoData(const void* ptr, void* userptr) {
  {  // Use allocation map if possible to handle fragments.
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);
    const auto& it = allocation_map_.find(ptr);
    if (it != allocation_map_.end()) {
      it->second.user_ptr = userptr;
@@ -1307,7 +1308,7 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) {
     size_t len = 0;

     // Search for registered export pointer
-     ScopedAcquire<KernelMutex> lock(&ipc_sock_server_lock_);
+     std::lock_guard<std::mutex> lock(ipc_sock_server_lock_);
     for (auto& conns : ipc_sock_server_conns_) {
       if (conn_handle == conns.first) {
         ptr = reinterpret_cast<void *>(conn_handle);
@@ -1372,7 +1373,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han
    if (useFrag) {
      handle->handle[6] |= 0x80000000 | fragOffset;
      // Prevent realloction of fragment for better performance.
-      ScopedAcquire<KernelSharedMutex::Shared> lock(memory_lock_.shared());
+      std::shared_lock<std::shared_mutex> lock(memory_lock_);
      err = allocation_map_[ptr].region->IPCFragmentExport(ptr);
      assert(err == HSA_STATUS_SUCCESS && "Region inconsistent with address map.");
    }
@@ -1439,7 +1440,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han

  close(dmabuf_fd);

-  ScopedAcquire<KernelMutex> lock(&ipc_sock_server_lock_);
+  std::lock_guard<std::mutex> lock(ipc_sock_server_lock_);
 #if defined(__linux__)
  if (!ipc_sock_server_conns_.size()) { // create new runtime socket server
    struct sockaddr_un address;
@@ -1549,7 +1550,7 @@ int Runtime::IPCClientImport(uint32_t conn_handle, uint64_t dmabuf_fd_handle,

      // Store the buffer object handle in allocation map for later use
      if (err == HSAKMT_STATUS_SUCCESS) {
-        ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+        std::lock_guard<std::shared_mutex> lock(memory_lock_);
        allocation_map_[*importAddress] =
            AllocationRegion(nullptr, *importSize, *importSize, core::MemoryRegion::AllocateNoFlags);
        allocation_map_[*importAddress].ldrm_bo = res.buf_handle;
@@ -1579,7 +1580,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
      importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
      len = Min(len, importSize - fragOffset);
    }
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::lock_guard<std::shared_mutex> lock(memory_lock_);
    allocation_map_[importAddress] =
        AllocationRegion(nullptr, len, len, core::MemoryRegion::AllocateNoFlags);
    allocation_map_[importAddress].ldrm_bo = ldrm_bo;
@@ -1711,7 +1712,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
 hsa_status_t Runtime::IPCDetach(void* ptr) {
  bool ldrmImportCleaned = false;
  {  // Handle imported fragments.
-    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+    std::unique_lock<std::shared_mutex> lock(memory_lock_);
    const auto& it = allocation_map_.find(ptr);
    if (it != allocation_map_.end()) {
      if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -1728,7 +1729,7 @@ hsa_status_t Runtime::IPCDetach(void* ptr) {
      assert(!"Unimplemented!");
 #endif
      allocation_map_.erase(it);
-      lock.Release();  // Can't hold memory lock when using pointer info.
+      lock.unlock();  // Can't hold memory lock when using pointer info.

      PtrInfoBlockData block = {};
      hsa_amd_pointer_info_t info = {};
@@ -1954,7 +1955,7 @@ void Runtime::AsyncEventsPool::clear() {
 }

 Runtime::AsyncEventItem* Runtime::AsyncEventsPool::alloc() {
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
  if (free_list_.empty()) {
    AsyncEventItem* block = reinterpret_cast<AsyncEventItem*>(
        allocate_()(block_size_ * sizeof(AsyncEventItem), __alignof(AsyncEventItem), core::MemoryRegion::AllocateNonPaged, 0));
@@ -1985,7 +1986,7 @@ void Runtime::AsyncEventsPool::free(AsyncEventItem* ptr) {
  if (ptr == nullptr) return;

  ptr->~AsyncEventItem();
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);

  ifdebug {
    bool valid = false;
@@ -2059,33 +2060,33 @@ void Runtime::BindErrorHandlers() {

  // Create memory event with manual reset to avoid racing condition
  // with driver in case of multiple concurrent VM faults.
-  vm_fault_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true);
+  vm_fault_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true));

  // Create an interrupt signal object to contain the memory event.
  // This signal object will be registered with the async handler global
  // thread.
-  vm_fault_signal_ = new core::InterruptSignal(0, vm_fault_event_);
+  vm_fault_signal_.reset(new core::InterruptSignal(0, vm_fault_event_.get()));

  if (!vm_fault_signal_->IsValid() || vm_fault_signal_->EopEvent() == NULL) {
    assert(false && "Failed on creating VM fault signal");
    return;
  }

-  SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), HSA_SIGNAL_CONDITION_NE, 0,
-                        VMFaultHandler, reinterpret_cast<void*>(vm_fault_signal_));
+  SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0,
+                        VMFaultHandler, reinterpret_cast<void*>(vm_fault_signal_.get()));

  // Create HW exception event which is for Non-RAS events
-  hw_exception_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true);
+  hw_exception_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true));

-  hw_exception_signal_ = new core::InterruptSignal(0, hw_exception_event_);
+  hw_exception_signal_.reset(new core::InterruptSignal(0, hw_exception_event_.get()));

  if (!hw_exception_signal_->IsValid() || hw_exception_signal_->EopEvent() == NULL) {
    assert(false && "Failed on creating HW Exception signal");
    return;
  }

-  SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_), HSA_SIGNAL_CONDITION_NE, 0,
-                        HwExceptionHandler, reinterpret_cast<void*>(hw_exception_signal_));
+  SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0,
+                        HwExceptionHandler, reinterpret_cast<void*>(hw_exception_signal_.get()));
 }

 bool Runtime::HwExceptionHandler(hsa_signal_value_t val, void* arg) {
@@ -2262,7 +2263,8 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
 }

 void Runtime::PrintMemoryMapNear(void* ptr) {
-  runtime_singleton_->memory_lock_.Acquire();
+  std::unique_lock<std::shared_mutex> lock(runtime_singleton_->memory_lock_);
+
  auto it = runtime_singleton_->allocation_map_.upper_bound(ptr);
  for (int i = 0; i < 2; i++) {
    if (it != runtime_singleton_->allocation_map_.begin()) it--;
@@ -2287,8 +2289,9 @@ void Runtime::PrintMemoryMapNear(void* ptr) {
    it++;
  }
  fprintf(stderr, "\n");
-  it = start;
-  runtime_singleton_->memory_lock_.Release();
+  it = start;
+  lock.unlock();
+
  hsa_amd_pointer_info_t info = {};
  PtrInfoBlockData block = {};
  uint32_t count = 0;
@@ -2408,7 +2411,7 @@ hsa_status_t Runtime::Load() {

  BindErrorHandlers();

-  loader_ = amd::hsa::loader::Loader::Create(&loader_context_);
+  loader_.reset(amd::hsa::loader::Loader::Create(&loader_context_));

  // Load extensions
  LoadExtensions();
@@ -2449,8 +2452,8 @@ void Runtime::Unload() {
  UnloadTools();
  UnloadExtensions();

-  amd::hsa::loader::Loader::Destroy(loader_);
-  loader_ = nullptr;
+  amd::hsa::loader::Loader::Destroy(loader_.get());
+  loader_.reset();  // NOTE(review): Destroy() may have freed it already - confirm deleter.

  for(auto nodeAgent: agents_by_node_) {
    for (auto agent: nodeAgent.second)
@@ -2462,17 +2465,17 @@ void Runtime::Unload() {

  if (vm_fault_signal_ != nullptr) {
    vm_fault_signal_->DestroySignal();
-    vm_fault_signal_ = nullptr;
+    vm_fault_signal_.reset();  // NOTE(review): confirm deleter does not destroy the signal again.
  }
-  core::InterruptSignal::DestroyEvent(vm_fault_event_);
-  vm_fault_event_ = nullptr;
+  // NOTE(review): reset() replaces DestroyEvent(); confirm the deleter calls it.
+  vm_fault_event_.reset();

  if (hw_exception_signal_ != nullptr) {
    hw_exception_signal_->DestroySignal();
-    hw_exception_signal_ = nullptr;
+    hw_exception_signal_.reset();  // NOTE(review): same deleter question as vm_fault_signal_.
  }
-  core::InterruptSignal::DestroyEvent(hw_exception_event_);
-  hw_exception_event_ = nullptr;
+  // NOTE(review): same deleter question as vm_fault_event_.
+  hw_exception_event_.reset();

  SharedSignalPool.clear();

@@ -2890,7 +2893,7 @@ void Runtime::AsyncEvents::Clear() {

 hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback,
                                                  void* data) {
-  ScopedAcquire<KernelMutex> lock(&system_event_lock_);
+  std::lock_guard<std::mutex> lock(system_event_lock_);
  system_event_handlers_.push_back(
      std::make_pair(AMD::callback_t<hsa_amd_system_event_callback_t>(callback), data));
  return HSA_STATUS_SUCCESS;
@@ -2898,7 +2901,7 @@ hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_

 std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
 Runtime::GetSystemEventHandlers() {
-  ScopedAcquire<KernelMutex> lock(&system_event_lock_);
+  std::lock_guard<std::mutex> lock(system_event_lock_);
  return system_event_handlers_;
 }

@@ -3269,7 +3272,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
  }

  {
-    ScopedAcquire<KernelMutex> lock(&prefetch_lock_);
+    std::lock_guard<std::mutex> lock(prefetch_lock_);
    // Remove all fully overlapped and trim partially overlapped ranges.
    // Get iteration bounds
    auto start = prefetch_map_.upper_bound(base);
@@ -3332,7 +3335,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,

  // Remove the prefetch's ranges from the map.
  static auto removePrefetchRanges = [](PrefetchOp* op) {
-    ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
+    std::lock_guard<std::mutex> lock(Runtime::runtime_singleton_->prefetch_lock_);
    auto it = op->prefetch_map_entry;
    while (it != Runtime::runtime_singleton_->prefetch_map_.end()) {
      auto next = it->second.next;
@@ -3389,7 +3392,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {

  std::vector<std::pair<uintptr_t, uintptr_t>> holes;

-  ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
+  std::lock_guard<std::mutex> lock(Runtime::runtime_singleton_->prefetch_lock_);
  auto start = prefetch_map_.upper_bound(base);
  if (start != prefetch_map_.begin()) start--;
  auto stop = prefetch_map_.lower_bound(end);
@@ -3441,7 +3444,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
 hsa_status_t Runtime::DmaBufExport(const void* ptr, size_t size, int* dmabuf, uint64_t* offset,
                                   uint64_t flags) {
 #ifdef __linux__
-  ScopedAcquire<KernelSharedMutex::Shared> lock(memory_lock_.shared());
+  std::shared_lock<std::shared_mutex> lock(memory_lock_);
  // Lookup containing allocation.
  auto mem = allocation_map_.upper_bound(ptr);
  if (mem != allocation_map_.begin()) {
@@ -3507,7 +3510,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add

  if (!alignment) alignment = rocr::os::PageSize();

-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);

  if (flags & HSA_AMD_VMEM_ADDRESS_NO_REGISTER) {
    size_t requested = size + alignment - rocr::os::PageSize();
@@ -3548,7 +3551,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add
 }

 hsa_status_t Runtime::VMemoryAddressFree(void* va, size_t size) {
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
  std::map<const void*, AddressHandle>::iterator it = reserved_address_map_.find(va);

  if (it == reserved_address_map_.end()) {
@@ -3580,7 +3583,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
  if (!IsMultipleOf(size, memRegion->GetPageSize()))
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;

-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
  ThunkHandle user_mode_driver_handle;
  hsa_status_t status =
      region->Allocate(size, alloc_flags, &user_mode_driver_handle, 0);
@@ -3597,7 +3600,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
 }

 hsa_status_t Runtime::VMemoryHandleRelease(hsa_amd_vmem_alloc_handle_t memoryOnlyHandle) {
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
  auto memoryHandleIt = memory_handle_map_.find(MemoryHandle::Convert(memoryOnlyHandle));

  if (memoryHandleIt == memory_handle_map_.end()) {
@@ -3628,7 +3631,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
  uint64_t offset = 0, ret;
  uint64_t drm_cpu_addr = 0;

-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
  auto addressHandle = VMemoryFindReservedAddressHandle(va);
  if (addressHandle == nullptr ||
      reinterpret_cast<uint8_t*>(va) + size >
@@ -3703,7 +3706,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
 }

 hsa_status_t Runtime::VMemoryHandleUnmap(void* va, size_t size) {
-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);
  std::list<std::pair<void*, MappedHandle*>> mappedHandles;

  // va + size may consist of multiple MappedHandle's.
@@ -3921,7 +3924,7 @@ hsa_status_t Runtime::VMemorySetAccess(void* va, size_t size,
    if (targetAgent == NULL || !targetAgent->IsValid()) return HSA_STATUS_ERROR_INVALID_AGENT;
  }

-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);

  auto addressHandle = VMemoryFindReservedAddressHandle(va);
  if (addressHandle == nullptr ||
@@ -4014,7 +4017,7 @@ hsa_status_t Runtime::VMemoryGetAccess(const void* va, hsa_access_permission_t*
  *perms = HSA_ACCESS_PERMISSION_NONE;
  bool mappedHandleFound = false;

-  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
+  std::lock_guard<std::shared_mutex> lock(memory_lock_);

  auto mappedHandleIt = mapped_handle_map_.upper_bound(va);
  if (mappedHandleIt != mapped_handle_map_.begin()) {
@@ -4076,8 +4079,8 @@ hsa_status_t Runtime::VMemoryImportShareableHandle(int dmabuf_fd,
      return;
    }

-    for (const core::MemoryRegion* region : agent->regions()) {
-      const AMD::MemoryRegion* amd_region = reinterpret_cast<const AMD::MemoryRegion*>(region);
+    for (const auto& region : agent->regions()) {
+      const AMD::MemoryRegion* amd_region = reinterpret_cast<const AMD::MemoryRegion*>(region.get());

      // TODO: Verify that this works on a system with FINE_GRAINED memory.
      // System's with FINE_GRAINED will have both COARSE and FINE grain... need to get the
@@ -58,7 +58,7 @@
 namespace rocr {
 namespace core {

-KernelMutex Signal::ipcLock_;
+std::mutex Signal::ipcLock_;
 std::map<decltype(hsa_signal_t::handle), Signal*> Signal::ipcMap_;

 void SharedSignalPool_t::clear() {
@@ -76,7 +76,7 @@ void SharedSignalPool_t::clear() {
 }

 SharedSignal* SharedSignalPool_t::alloc() {
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);
  if (free_list_.empty()) {
    SharedSignal* block = reinterpret_cast<SharedSignal*>(
        allocate_()(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), core::MemoryRegion::AllocateNonPaged, 0));
@@ -109,7 +109,7 @@ void SharedSignalPool_t::free(SharedSignal* ptr) {
  if (ptr == nullptr) return;

  ptr->~SharedSignal();
-  ScopedAcquire<HybridMutex> lock(&lock_);
+  std::lock_guard<HybridMutex> lock(lock_);

  ifdebug {
    bool valid = false;
@@ -134,7 +134,7 @@ LocalSignal::LocalSignal(hsa_signal_value_t initial_value, bool exportable)
 }

 void Signal::registerIpc() {
-  ScopedAcquire<KernelMutex> lock(&ipcLock_);
+  std::lock_guard<std::mutex> lock(ipcLock_);
  auto handle = Convert(this);
  assert(ipcMap_.find(handle.handle) == ipcMap_.end() &&
         "Can't register the same IPC signal twice.");
@@ -142,7 +142,7 @@ void Signal::registerIpc() {
 }

 bool Signal::deregisterIpc() {
-  ScopedAcquire<KernelMutex> lock(&ipcLock_);
+  std::lock_guard<std::mutex> lock(ipcLock_);
  if (refcount_ != 0) return false;
  auto handle = Convert(this);
  const auto& it = ipcMap_.find(handle.handle);
@@ -152,14 +152,14 @@ bool Signal::deregisterIpc() {
 }

 Signal* Signal::lookupIpc(hsa_signal_t signal) {
-  ScopedAcquire<KernelMutex> lock(&ipcLock_);
+  std::lock_guard<std::mutex> lock(ipcLock_);
  const auto& it = ipcMap_.find(signal.handle);
  if (it == ipcMap_.end()) return nullptr;
  return it->second;
 }

 Signal* Signal::duplicateIpc(hsa_signal_t signal) {
-  ScopedAcquire<KernelMutex> lock(&ipcLock_);
+  std::lock_guard<std::mutex> lock(ipcLock_);
  const auto& it = ipcMap_.find(signal.handle);
  if (it == ipcMap_.end()) return nullptr;
  it->second->refcount_++;
@@ -125,16 +125,16 @@ template <typename T> class lazy_ptr {
 private:
  mutable std::unique_ptr<T> obj;
  mutable std::function<T*(void)> func;
-  mutable KernelMutex lock;
+  mutable std::mutex lock;

  // Separated from make to improve inlining.
  void make_body(bool block) const {
    if (block) {
-      lock.Acquire();
-    } else if (!lock.Try()) {
+      lock.lock();
+    } else if (!lock.try_lock()) {
      return;
    }
-    MAKE_SCOPE_GUARD([&]() { lock.Release(); });
+    MAKE_SCOPE_GUARD([&]() { lock.unlock(); });
    if (func == nullptr) return;
    T* ptr = func();
    obj.reset(ptr);
@@ -90,6 +90,11 @@ class HybridMutex {
      os::PostSemaphore(sem_);
  }

+  // Standard-style lock()/unlock()/try_lock() names forwarding to Acquire()/Release()/Try(), so std::lock_guard can wrap this mutex.
+  void lock() { Acquire(); }
+  void unlock() { Release(); }
+  bool try_lock() { return Try(); }
+
 private:
  std::atomic<int> lock_;
  os::Semaphore sem_;
@@ -100,27 +105,6 @@ class HybridMutex {
  DISALLOW_COPY_AND_ASSIGN(HybridMutex);
 };

-
-/// @brief: a class represents a kernel mutex.
-/// Uses the kernel's scheduler to keep the waiting thread from being scheduled
-/// until the lock is released (Best for long waits, though anything using
-/// a kernel object is a long wait).
-class KernelMutex {
- public:
-  KernelMutex() { lock_ = os::CreateMutex(); }
-  ~KernelMutex() { os::DestroyMutex(lock_); }
-
-  bool Try() { return os::TryAcquireMutex(lock_); }
-  bool Acquire() { return os::AcquireMutex(lock_); }
-  void Release() { os::ReleaseMutex(lock_); }
-
- private:
-  os::Mutex lock_;
-
-  /// @brief: Disable copiable and assignable ability.
-  DISALLOW_COPY_AND_ASSIGN(KernelMutex);
-};
-
 /// @brief: represents a spin lock.
 /// For very short hold durations on the order of the thread scheduling
 /// quanta or less.
@@ -143,6 +127,11 @@ class SpinMutex {
  }
  void Release() { lock_ = 0; }

+  // Standard-style lock()/unlock()/try_lock() names forwarding to Acquire()/Release()/Try(), so std::lock_guard can wrap this mutex.
+  void lock() { Acquire(); }
+  void unlock() { Release(); }
+  bool try_lock() { return Try(); }
+
 private:
  std::atomic<int> lock_;

@@ -167,124 +156,6 @@ class KernelEvent {
  DISALLOW_COPY_AND_ASSIGN(KernelEvent);
 };

-/// @brief: represents a yielding shared mutex.
-/// aka read/write mutex
-class KernelSharedMutex {
- public:
-  /// @brief: Interfaces ScopedAcquire to shared operations.
-  class Shared {
-   public:
-    explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
-    bool Try() { return lock_->TryShared(); }
-    bool Acquire() { return lock_->AcquireShared(); }
-    void Release() { lock_->ReleaseShared(); }
-
-   private:
-    KernelSharedMutex* lock_;
-  };
-
-  KernelSharedMutex() { lock_ = os::CreateSharedMutex(); }
-  ~KernelSharedMutex() { os::DestroySharedMutex(lock_); }
-
-  // Exclusive mode operations
-  bool Try() { return os::TryAcquireSharedMutex(lock_); }
-  bool Acquire() { return os::AcquireSharedMutex(lock_); }
-  void Release() { os::ReleaseSharedMutex(lock_); }
-
-  // Shared mode operations
-  bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); }
-  bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); }
-  void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); }
-
-  // Return shared operations interface
-  Shared shared() { return Shared(this); }
-
- private:
-  os::SharedMutex lock_;
-
-  /// @brief: Disable copiable and assignable ability.
-  DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
-};
-
-/// @brief: Type trait to identify mutex types
-template <class T> class isMutex {
- public:
-  enum { value = false };
-};
-template <> class isMutex<HybridMutex> {
- public:
-  enum { value = true };
-};
-template <> class isMutex<KernelMutex> {
- public:
-  enum { value = true };
-};
-template <> class isMutex<SpinMutex> {
- public:
-  enum { value = true };
-};
-template <> class isMutex<KernelSharedMutex> {
- public:
-  enum { value = true };
-};
-
-/// @brief: A class behaves as a lock in a scope. When trying to enter into the
-/// critical section, creat a object of this class. After the control path goes
-/// out of the scope, it will release the lock automatically.
-template <class LockType> class ScopedAcquire {
- public:
-  /// @brief: When constructing, acquire the lock.
-  /// @param: lock(Input), pointer to an existing lock.
-  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
-    static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
-    lock_.Acquire();
-  }
-  explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
-    static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
-    lock_.Acquire();
-  }
-
-  /// @brief: when destructing, release the lock.
-  ~ScopedAcquire() {
-    if (doRelease) lock_.Release();
-  }
-
-  /// @brief: Release the lock early.  Avoid using when possible.
-  void Release() {
-    lock_.Release();
-    doRelease = false;
-  }
-
- private:
-  /// @brief: Adapts between pointers to mutex types and mutex pointer types.
-  template <class T, bool B> class container {
-   public:
-    container(T* lock) : lock_(lock) {}
-    __forceinline bool Acquire() { return lock_->Acquire(); }
-    __forceinline void Release() { return lock_->Release(); }
-
-   private:
-    T* lock_;
-  };
-
-  /// @brief: Specialization for mutex pointer types.
-  template <class T> class container<T, false> {
-   public:
-    container(T lock) : lock_(lock) {}
-    __forceinline bool Acquire() { return lock_.Acquire(); }
-    __forceinline void Release() { return lock_.Release(); }
-
-   private:
-    T lock_;
-  };
-
-  container<LockType, isMutex<LockType>::value> lock_;
-  bool doRelease;
-
-  /// @brief: Disable copiable and assignable ability.
-  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
-};
-
 }  // namespace rocr

 #endif  // HSA_RUNTIME_CORE_SUTIL_LOCKS_H_
@@ -286,11 +286,6 @@ namespace code {
      }
    }

-    AmdHsaCode::~AmdHsaCode()
-    {
-      for (Symbol* sym : symbols) { delete sym; }
-    }
-
    bool AmdHsaCode::PullElf()
    {
      uint32_t majorVersion, minorVersion;
@@ -330,7 +325,7 @@ namespace code {
      }
      for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) {
        amd::elf::Symbol* elfsym = img->symtab()->symbol(i);
-        Symbol* sym = 0;
+        std::shared_ptr<Symbol> sym;
        switch (elfsym->type()) {
        case STT_AMDGPU_HSA_KERNEL: {
          amd::elf::Section* sec = elfsym->section();
@@ -347,12 +342,12 @@ namespace code {
            out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl;
            return false;
          }
-          sym = new KernelSymbol(elfsym, &akc);
+          sym = std::make_shared<KernelSymbol>(elfsym, &akc);
          break;
        }
        case STT_OBJECT:
        case STT_COMMON:
-          sym = new VariableSymbol(elfsym);
+          sym = std::make_shared<VariableSymbol>(elfsym);
          break;
        default:
          break; // Skip unknown symbols.
@@ -924,9 +919,9 @@ namespace code {
        std::string(module_name ? module_name : ""),
        std::string(symbol_name)
      );
-      for (Symbol* sym : symbols) {
+      for (const auto& sym : symbols) {
        if (sym->Name() == mname) {
-          *s = Symbol::ToHandle(sym);
+          *s = Symbol::ToHandle(sym.get());
          return HSA_STATUS_SUCCESS;
        }
      }
@@ -940,8 +935,8 @@ namespace code {
                                  void* data),
                                void* data)
    {
-      for (Symbol* sym : symbols) {
-        hsa_code_symbol_t s = Symbol::ToHandle(sym);
+      for (const auto& sym : symbols) {
+        hsa_code_symbol_t s = Symbol::ToHandle(sym.get());
        hsa_status_t status = callback(code_object, s, data);
        if (status != HSA_STATUS_SUCCESS) { return status; }
      }
@@ -1144,8 +1139,8 @@ namespace code {
    {
      if (nullptr == img) { return nullptr; }
      if (!section) { section = HsaText(); }
-      symbols.push_back(new KernelSymbol(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr));
-      return symbols.back();
+      symbols.push_back(std::make_shared<KernelSymbol>(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr));
+      return symbols.back().get();
    }

    Symbol* AmdHsaCode::AddVariableSymbol(const std::string &name,
@@ -1157,8 +1152,8 @@ namespace code {
                                          uint64_t size)
    {
      if (nullptr == img) { return nullptr; }
-      symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(section, name, value, size, type, binding, other)));
-      return symbols.back();
+      symbols.push_back(std::make_shared<VariableSymbol>(img->symtab()->addSymbol(section, name, value, size, type, binding, other)));
+      return symbols.back().get();
    }

    void AmdHsaCode::AddSectionSymbols()
@@ -1166,16 +1161,16 @@ namespace code {
      if (nullptr == img) { return; }
      for (size_t i = 0; i < dataSections.size(); ++i) {
        if (dataSections[i] && dataSections[i]->flags() & SHF_ALLOC) {
-          symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL)));
+          symbols.push_back(std::make_shared<VariableSymbol>(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL)));
        }
      }
    }

    Symbol* AmdHsaCode::GetSymbolByElfIndex(size_t index)
    {
-      for (auto &s : symbols) {
+      for (const auto &s : symbols) {
        if (s && index == s->Index()) {
-          return s;
+          return s.get();
        }
      }
      return nullptr;
@@ -1185,7 +1180,7 @@ namespace code {
    {
      for (auto &s : symbols) {
        if (s && n == s->Name()) {
-          return s;
+          return s.get();
        }
      }
      return nullptr;
@@ -1747,14 +1742,13 @@ namespace code {
      return false;
    }

-      AmdHsaCode* AmdHsaCodeManager::FromHandle(hsa_code_object_t c)
+      const std::shared_ptr<AmdHsaCode>& AmdHsaCodeManager::FromHandle(hsa_code_object_t c)
       {
         CodeMap::iterator i = codeMap.find(c.handle);
         if (i == codeMap.end()) {
-          AmdHsaCode* code = new AmdHsaCode();
+          std::shared_ptr<AmdHsaCode> code = std::make_shared<AmdHsaCode>();
           const void* buffer = reinterpret_cast<const void*>(c.handle);
           if (!code->InitAsBuffer(buffer, 0)) {
-            delete code;
-            return 0;
+            static const std::shared_ptr<AmdHsaCode> null_code; return null_code;  // long-lived null: `return 0` would bind the reference to a destroyed temporary
           }
           codeMap[c.handle] = code;
@@ -1770,7 +1764,7 @@ namespace code {
          // Currently, we do not always create map entry for every code object buffer.
          return true;
        }
-        delete i->second;
+        i->second.reset();
        codeMap.erase(i);
        return true;
      }
@@ -1798,7 +1792,7 @@ namespace code {
      }
      for (size_t i = 0; i < img->getSymbolTable()->symbolCount(); ++i) {
        amd::elf::Symbol* elfsym = img->getSymbolTable()->symbol(i);
-        Symbol* sym = 0;
+        std::shared_ptr<Symbol> sym;
        switch (elfsym->type()) {
        case STT_AMDGPU_HSA_KERNEL: {
          amd::elf::Section* sec = elfsym->section();
@@ -1815,12 +1809,12 @@ namespace code {
            out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl;
            return false;
          }
-          sym = new KernelSymbolV2(elfsym, &akc);
+          sym = std::make_shared<KernelSymbolV2>(elfsym, &akc);
          break;
        }
        case STT_OBJECT:
        case STT_COMMON:
-          sym = new VariableSymbolV2(elfsym);
+          sym = std::make_shared<VariableSymbolV2>(elfsym);
          break;
        default:
          break; // Skip unknown symbols.
@@ -186,7 +186,6 @@ void Loader::Destroy(Loader *loader)
  _amdgpu_r_debug.r_map = nullptr;
  _amdgpu_r_debug.r_state = r_debug::RT_CONSISTENT;
  r_debug_tail() = nullptr;
-  delete loader;
 }

 Executable* AmdHsaCodeLoader::CreateExecutable(
@@ -194,8 +193,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable(
 {
  WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);

-  executables.push_back(new ExecutableImpl(profile, context, executables.size(), default_float_rounding_mode));
-  return executables.back();
+  executables.push_back(std::make_shared<ExecutableImpl>(profile, context, executables.size(), default_float_rounding_mode));
+  return executables.back().get();
 }

 Executable* AmdHsaCodeLoader::CreateExecutable(
@@ -206,8 +205,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable(
 {
  WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);

-  executables.push_back(new ExecutableImpl(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode));
-  return executables.back();
+  executables.push_back(std::make_shared<ExecutableImpl>(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode));
+  return executables.back().get();
 }

 static void AddCodeObjectInfoIntoDebugMap(link_map* map) {
@@ -254,7 +253,7 @@ hsa_status_t AmdHsaCodeLoader::FreezeExecutable(Executable *executable, const ch
  atomic::Fence(std::memory_order_acq_rel);
  _loader_debug_state();
  atomic::Fence(std::memory_order_acq_rel);
-  for (auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
+  for (const auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
    AddCodeObjectInfoIntoDebugMap(&(lco->r_debug_info));
  }
  atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release);
@@ -270,14 +269,13 @@ void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) {
  atomic::Fence(std::memory_order_acq_rel);
  _loader_debug_state();
  atomic::Fence(std::memory_order_acq_rel);
-  for (auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
+  for (const auto &lco : reinterpret_cast<ExecutableImpl*>(executable)->loaded_code_objects) {
    RemoveCodeObjectInfoFromDebugMap(&(lco->r_debug_info));
  }
  atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release);
  _loader_debug_state();

-  executables[((ExecutableImpl*)executable)->id()] = nullptr;
-  delete executable;
+  executables[static_cast<ExecutableImpl*>(executable)->id()].reset();
 }

 hsa_status_t AmdHsaCodeLoader::IterateExecutables(
@@ -289,9 +287,9 @@ hsa_status_t AmdHsaCodeLoader::IterateExecutables(
  WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
  assert(callback);

-  for (auto &exec : executables) {
+  for (const auto &exec : executables) {
    if(exec != nullptr){
-      hsa_status_t status = callback(Executable::Handle(exec), data);
+      hsa_status_t status = callback(Executable::Handle(exec.get()), data);
      if (status != HSA_STATUS_SUCCESS) {
        return status;
      }
@@ -318,7 +316,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
  this->EnableReadOnlyMode();

  size_t actual_num_segment_descriptors = 0;
-  for (auto &executable : executables) {
+  for (const auto &executable : executables) {
    if (executable) {
      actual_num_segment_descriptors += executable->GetNumSegmentDescriptors();
    }
@@ -335,7 +333,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
  }

  size_t i = 0;
-  for (auto &executable : executables) {
+  for (const auto &executable : executables) {
    if (executable) {
      i += executable->QuerySegmentDescriptors(segment_descriptors, actual_num_segment_descriptors, i);
    }
@@ -352,7 +350,7 @@ uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address)
    return 0;
  }

-  for (auto &exec : executables) {
+  for (const auto &exec : executables) {
    if (exec != nullptr) {
      uint64_t host_address = exec->FindHostAddress(device_address);
      if (host_address != 0) {
@@ -371,9 +369,9 @@ void AmdHsaCodeLoader::PrintHelp(std::ostream& out)
 void AmdHsaCodeLoader::EnableReadOnlyMode()
 {
   rw_lock_.ReaderLock();
-  for (auto &executable : executables) {
+  for (const auto &executable : executables) {
     if (executable) {
-      ((ExecutableImpl*)executable)->EnableReadOnlyMode();
+      static_cast<ExecutableImpl*>(executable.get())->EnableReadOnlyMode();  // named downcast, as in DestroyExecutable
     }
   }
 }
@@ -381,9 +379,9 @@ void AmdHsaCodeLoader::EnableReadOnlyMode()
 void AmdHsaCodeLoader::DisableReadOnlyMode()
 {
   rw_lock_.ReaderUnlock();
-  for (auto &executable : executables) {
+  for (const auto &executable : executables) {
     if (executable) {
-      ((ExecutableImpl*)executable)->DisableReadOnlyMode();
+      static_cast<ExecutableImpl*>(executable.get())->DisableReadOnlyMode();  // named downcast, as in DestroyExecutable
     }
   }
 }
@@ -781,18 +779,10 @@ ExecutableImpl::ExecutableImpl(
 }

 ExecutableImpl::~ExecutableImpl() {
-  for (ExecutableObject* o : objects) {
+  for (const auto& o : objects) {
    o->Destroy();
-    delete o;
  }
  objects.clear();
-
-  for (auto &symbol_entry : program_symbols_) {
-    delete symbol_entry.second;
-  }
-  for (auto &symbol_entry : agent_symbols_) {
-    delete symbol_entry.second;
-  }
 }

 hsa_status_t ExecutableImpl::DefineProgramExternalVariable(
@@ -812,7 +802,7 @@ hsa_status_t ExecutableImpl::DefineProgramExternalVariable(

  program_symbols_.insert(
    std::make_pair(std::string(name),
-                   new VariableSymbol(true,
+                   std::make_shared<VariableSymbol>(true,
                                      "", // Only program linkage symbols can be
                                          // defined.
                                      std::string(name),
@@ -848,7 +838,7 @@ hsa_status_t ExecutableImpl::DefineAgentExternalVariable(

  auto insert_status = agent_symbols_.insert(
    std::make_pair(std::make_pair(std::string(name), agent),
-                   new VariableSymbol(true,
+                   std::make_shared<VariableSymbol>(true,
                                      "", // Only program linkage symbols can be
                                          // defined.
                                      std::string(name),
@@ -896,14 +886,14 @@ Symbol* ExecutableImpl::GetSymbolInternal(
  if (!agent) {
    auto program_symbol = program_symbols_.find(mangled_name);
    if (program_symbol != program_symbols_.end()) {
-      return program_symbol->second;
+      return program_symbol->second.get();
    }
    return nullptr;
  }

  auto agent_symbol = agent_symbols_.find(std::make_pair(mangled_name, *agent));
  if (agent_symbol != agent_symbols_.end()) {
-    return agent_symbol->second;
+    return agent_symbol->second.get();
  }
  return nullptr;
 }
@@ -916,14 +906,14 @@ hsa_status_t ExecutableImpl::IterateSymbols(

  for (auto &symbol_entry : program_symbols_) {
    hsa_status_t hsc =
-      callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
+      callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
    if (HSA_STATUS_SUCCESS != hsc) {
      return hsc;
    }
  }
  for (auto &symbol_entry : agent_symbols_) {
    hsa_status_t hsc =
-      callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
+      callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
    if (HSA_STATUS_SUCCESS != hsc) {
      return hsc;
    }
@@ -948,7 +938,7 @@ hsa_status_t ExecutableImpl::IterateAgentSymbols(
    }

    hsa_status_t status = callback(
-        Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second),
+        Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second.get()),
        data);
    if (status != HSA_STATUS_SUCCESS) {
      return status;
@@ -968,7 +958,7 @@ hsa_status_t ExecutableImpl::IterateProgramSymbols(

  for (auto &symbol_entry : program_symbols_) {
    hsa_status_t status = callback(
-        Executable::Handle(this), Symbol::Handle(symbol_entry.second), data);
+        Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data);
    if (status != HSA_STATUS_SUCCESS) {
      return status;
    }
@@ -987,10 +977,10 @@ hsa_status_t ExecutableImpl::IterateLoadedCodeObjects(
  ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_);
  assert(callback);

-  for (auto &loaded_code_object : loaded_code_objects) {
+  for (const auto& loaded_code_object : loaded_code_objects) {
    hsa_status_t status = callback(
        Executable::Handle(this),
-        LoadedCodeObject::Handle(loaded_code_object),
+        LoadedCodeObject::Handle(loaded_code_object.get()),
        data);
    if (status != HSA_STATUS_SUCCESS) {
      return status;
@@ -1004,7 +994,7 @@ size_t ExecutableImpl::GetNumSegmentDescriptors()
 {
  // assuming we are in readonly mode.
  size_t actual_num_segment_descriptors = 0;
-  for (auto &obj : loaded_code_objects) {
+  for (const auto &obj : loaded_code_objects) {
    actual_num_segment_descriptors += obj->LoadedSegments().size();
  }
  return actual_num_segment_descriptors;
@@ -1020,7 +1010,7 @@ size_t ExecutableImpl::QuerySegmentDescriptors(
  assert(first_empty_segment_descriptor < total_num_segment_descriptors);

  size_t i = first_empty_segment_descriptor;
-  for (auto &obj : loaded_code_objects) {
+  for (const auto &obj : loaded_code_objects) {
    assert(i < total_num_segment_descriptors);
    for (auto &seg : obj->LoadedSegments()) {
      segment_descriptors[i].agent = seg->Agent();
@@ -1084,11 +1074,11 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address)
    return execHandle;
  }

-  for (auto &exec : executables) {
+  for (const auto &exec : executables) {
    if (exec != nullptr) {
      uint64_t host_address = exec->FindHostAddress(device_address);
      if (host_address != 0) {
-        return Executable::Handle(exec);
+        return Executable::Handle(exec.get());
      }
    }
  }
@@ -1098,7 +1088,7 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address)
 uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address)
 {
  ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_);
-  for (auto &obj : loaded_code_objects) {
+  for (const auto &obj : loaded_code_objects) {
    assert(obj);
    for (auto &seg : obj->LoadedSegments()) {
      assert(seg);
@@ -1224,7 +1214,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject(

  uint32_t codeNum = NextCodeObjectNum();

-  code.reset(new code::AmdHsaCode());
+  code = std::make_unique<code::AmdHsaCode>();

  std::string substituteFileName;
  for (const Substitute& ss : substitutes) {
@@ -1306,8 +1296,8 @@ hsa_status_t ExecutableImpl::LoadCodeObject(

  hsa_status_t status;

-  objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), code->ElfSize()));
-  loaded_code_objects.push_back((LoadedCodeObjectImpl*)objects.back());
+  objects.push_back(std::make_shared<LoadedCodeObjectImpl>(this, agent, code->ElfData(), code->ElfSize()));
+  loaded_code_objects.push_back(std::static_pointer_cast<LoadedCodeObjectImpl>(objects.back()));

  status = LoadSegments(agent, code.get(), majorVersion);
  if (status != HSA_STATUS_SUCCESS) return status;
@@ -1338,7 +1328,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
  loaded_code_objects.back()->r_debug_info.l_prev = nullptr;
  loaded_code_objects.back()->r_debug_info.l_next = nullptr;

-  if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back()); }
+  if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back().get()); }
  return HSA_STATUS_SUCCESS;
 }

@@ -1376,18 +1366,18 @@ hsa_status_t ExecutableImpl::LoadSegmentsV2(hsa_agent_t agent,
      AMD_ISA_ALIGN_BYTES, true);
  if (!ptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

-  Segment *load_segment = new Segment(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT,
+  std::shared_ptr<Segment> load_segment = std::make_shared<Segment>(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT,
      ptr, size, vaddr, c->DataSegment(0)->offset());
  if (!load_segment) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

  hsa_status_t status = HSA_STATUS_SUCCESS;
  for (size_t i = 0; i < c->DataSegmentCount(); ++i) {
-    status = LoadSegmentV2(c->DataSegment(i), load_segment);
+    status = LoadSegmentV2(c->DataSegment(i), load_segment.get());
    if (status != HSA_STATUS_SUCCESS) return status;
  }

  objects.push_back(load_segment);
-  loaded_code_objects.back()->LoadedSegments().push_back(load_segment);
+  loaded_code_objects.back()->LoadedSegments().push_back(load_segment.get());

  return HSA_STATUS_SUCCESS;
 }
@@ -1398,7 +1388,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
  if (s->memSize() == 0)
    return HSA_STATUS_SUCCESS;
  amdgpu_hsa_elf_segment_t segment = (amdgpu_hsa_elf_segment_t)(s->type() - PT_LOOS);
-  Segment *new_seg = nullptr;
+  std::shared_ptr<Segment> new_seg;
  bool need_alloc = true;
  if (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM && nullptr != program_allocation_segment) {
    new_seg = program_allocation_segment;
@@ -1407,7 +1397,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
  if (need_alloc) {
    void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true);
    if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }
-    new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
+    new_seg = std::make_shared<Segment>(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
    new_seg->Copy(s->vaddr(), s->data(), s->imageSize());
    objects.push_back(new_seg);

@@ -1416,7 +1406,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
    }
  }
  assert(new_seg);
-  loaded_code_objects.back()->LoadedSegments().push_back(new_seg);
+  loaded_code_objects.back()->LoadedSegments().push_back(new_seg.get());
  return HSA_STATUS_SUCCESS;
 }

@@ -1471,7 +1461,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
  }

  uint64_t address = SymbolAddress(agent, sym);
-  SymbolImpl *symbol = nullptr;
+  std::shared_ptr<SymbolImpl> symbol;
  if (string_ends_with(sym->GetSymbolName(), ".kd")) {
    // V3.
    llvm::amdhsa::kernel_descriptor_t kd;
@@ -1486,7 +1476,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,

    uint64_t size = sym->Size();

-    KernelSymbol *kernel_symbol = new KernelSymbol(true,
+    std::shared_ptr<KernelSymbol> kernel_symbol = std::make_shared<KernelSymbol>(true,
                                    sym->GetModuleName(),
                                    sym->GetSymbolName(),
                                    sym->Linkage(),
@@ -1502,7 +1492,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
                                    address);
    symbol = kernel_symbol;
  } else if (sym->IsVariableSymbol()) {
-    symbol = new VariableSymbol(true,
+    symbol = std::make_shared<VariableSymbol>(true,
                       sym->GetModuleName(),
                       sym->GetSymbolName(),
                       sym->Linkage(),
@@ -1537,7 +1527,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
        // calculate end of segment - symbol value.
        size = sym->GetSection()->size() - sym->SectionOffset();
      }
-      KernelSymbol *kernel_symbol = new KernelSymbol(true,
+      std::shared_ptr<KernelSymbol> kernel_symbol = std::make_shared<KernelSymbol>(true,
                                      sym->GetModuleName(),
                                      sym->GetSymbolName(),
                                      sym->Linkage(),
@@ -1970,7 +1960,7 @@ void ExecutableImpl::Print(std::ostream& out)
      << std::endl << std::endl;
  out << "Loaded Objects (total " << objects.size() << ")" << std::endl;
  size_t i = 0;
-  for (ExecutableObject* o : objects) {
+  for (const auto& o : objects) {
    out << "Loaded Object " << i++ << ": ";
    o->Print(out);
    out << std::endl;
@@ -461,7 +461,7 @@ public:
 };

 typedef std::string ProgramSymbol;
-typedef std::unordered_map<ProgramSymbol, SymbolImpl*> ProgramSymbolMap;
+typedef std::unordered_map<ProgramSymbol, std::shared_ptr<SymbolImpl>> ProgramSymbolMap;

 typedef std::pair<std::string, hsa_agent_t> AgentSymbol;
 struct ASC {
@@ -476,7 +476,7 @@ struct ASH {
    return h ^ (i << 1);
  }
 };
-typedef std::unordered_map<AgentSymbol, SymbolImpl*, ASH, ASC> AgentSymbolMap;
+typedef std::unordered_map<AgentSymbol, std::shared_ptr<SymbolImpl>, ASH, ASC> AgentSymbolMap;

 class ExecutableImpl final: public Executable {
 friend class AmdHsaCodeLoader;
@@ -634,15 +634,15 @@ private:

  ProgramSymbolMap program_symbols_;
  AgentSymbolMap agent_symbols_;
-  std::vector<ExecutableObject*> objects;
-  Segment *program_allocation_segment;
-  std::vector<LoadedCodeObjectImpl*> loaded_code_objects;
+  std::vector<std::shared_ptr<ExecutableObject>> objects;
+  std::shared_ptr<Segment> program_allocation_segment;
+  std::vector<std::shared_ptr<LoadedCodeObjectImpl>> loaded_code_objects;
 };

 class AmdHsaCodeLoader : public Loader {
 private:
  Context* context;
-  std::vector<Executable*> executables;
+  std::vector<std::shared_ptr<Executable>> executables;
  amd::hsa::common::ReaderWriterLock rw_lock_;

 public:
@@ -282,7 +282,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal(
    size_t interval, size_t latency, size_t buffer_size,
    hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, void* client_cb_data,
    hsa_ven_amd_pcs_t* handle, agent_pcs_create_fn_t agent_pcs_create_fn) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);

  handle->handle = ++pc_sampling_id_;
  // create a new PcSamplingSession(agent, method, units, interval, latency, buffer_size,
@@ -305,7 +305,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal(
 }

 hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);
  auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
  if (pcSamplingSessionIt == pc_sampling_.end()) {
    debug_warning(false && "Cannot find PcSampling session");
@@ -319,7 +319,7 @@ hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
 }

 hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);
  auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
  if (pcSamplingSessionIt == pc_sampling_.end()) {
    debug_warning(false && "Cannot find PcSampling session");
@@ -331,7 +331,7 @@ hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) {
 }

 hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);
  auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
  if (pcSamplingSessionIt == pc_sampling_.end()) {
    debug_warning(false && "Cannot find PcSampling session");
@@ -343,7 +343,7 @@ hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) {
 }

 hsa_status_t PcsRuntime::PcSamplingFlush(hsa_ven_amd_pcs_t handle) {
-  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  std::lock_guard<std::mutex> lock(pc_sampling_lock_);
  auto pcSamplingSessionIt = pc_sampling_.find(static_cast<uint64_t>(handle.handle));
  if (pcSamplingSessionIt == pc_sampling_.end()) {
    debug_warning(false && "Cannot find PcSampling session");
@@ -166,7 +166,7 @@ class PcsRuntime {
 }
  // Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle
  std::map<uint64_t, PcSamplingSession> pc_sampling_;
-  KernelMutex pc_sampling_lock_;
+  std::mutex pc_sampling_lock_;
  uint64_t pc_sampling_id_;

  DISALLOW_COPY_AND_ASSIGN(PcsRuntime);