From 637b0d71f0ea7da409d7126b5828cc1982f02d92 Mon Sep 17 00:00:00 2001 From: pghoshamd Date: Tue, 6 Jan 2026 10:59:34 -0500 Subject: [PATCH] SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers (#2146) * SWDEV-569319 Replace ScopedAcquire with stdcpp wrappers * Remove KernelMutex and KernelSharedMutex abstractions with std::mutex and std::shared_mutex * Replaced unique_locks with lock_guards * More changes * Replace new and deletes with smart pointers * Replaced some more with shared ptrs * Replacements with smart pointers - pt 2 * missed change --- .../runtime/hsa-runtime/core/inc/agent.h | 7 +- .../hsa-runtime/core/inc/amd_aie_agent.h | 4 +- .../hsa-runtime/core/inc/amd_aql_queue.h | 10 +- .../hsa-runtime/core/inc/amd_blit_sdma.h | 2 +- .../hsa-runtime/core/inc/amd_cpu_agent.h | 6 +- .../hsa-runtime/core/inc/amd_gpu_agent.h | 22 +-- .../hsa-runtime/core/inc/amd_hsa_code.hpp | 10 +- .../hsa-runtime/core/inc/amd_hsa_loader.hpp | 2 +- .../hsa-runtime/core/inc/amd_memory_region.h | 2 +- .../hsa-runtime/core/inc/intercept_queue.h | 2 +- .../runtime/hsa-runtime/core/inc/ipc_signal.h | 2 +- .../runtime/hsa-runtime/core/inc/runtime.h | 40 +++-- .../runtime/hsa-runtime/core/inc/signal.h | 3 +- .../core/runtime/amd_aie_agent.cpp | 22 +-- .../core/runtime/amd_aql_queue.cpp | 22 +-- .../core/runtime/amd_blit_sdma.cpp | 4 +- .../core/runtime/amd_cpu_agent.cpp | 15 +- .../core/runtime/amd_gpu_agent.cpp | 83 +++++----- .../core/runtime/amd_loader_context.cpp | 10 +- .../core/runtime/amd_memory_region.cpp | 13 +- .../runtime/hsa-runtime/core/runtime/hsa.cpp | 10 +- .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 4 +- .../core/runtime/intercept_queue.cpp | 2 +- .../core/runtime/interrupt_signal.cpp | 4 +- .../hsa-runtime/core/runtime/ipc_signal.cpp | 4 +- .../hsa-runtime/core/runtime/runtime.cpp | 119 +++++++------- .../hsa-runtime/core/runtime/signal.cpp | 14 +- .../runtime/hsa-runtime/core/util/lazy_ptr.h | 8 +- .../runtime/hsa-runtime/core/util/locks.h | 149 ++---------------- .../libamdhsacode/amd_hsa_code.cpp | 48 +++--- .../runtime/hsa-runtime/loader/executable.cpp | 104 ++++++------ .../runtime/hsa-runtime/loader/executable.hpp | 12 +- .../runtime/hsa-runtime/pcs/pcs_runtime.cpp | 10 +- .../runtime/hsa-runtime/pcs/pcs_runtime.h | 2 +- 34 files changed, 319 insertions(+), 452 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h index fb8f89c118..ee7ec26a8b 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h @@ -47,6 +47,7 @@ #include #include +#include #include "core/inc/checked.h" #include "core/inc/isa.h" @@ -291,7 +292,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { void* value) const = 0; // @brief Returns an array of regions owned by the agent. - virtual const std::vector& regions() const = 0; + virtual const std::vector>& regions() const = 0; // @brief Returns the ISA's supported by the agent. // @details The returned vector is a list of pointers to the supported ISA, @@ -336,7 +337,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { __forceinline void Disable() { enabled_ = false; } virtual void Trim() { - for (auto region : regions()) region->Trim(); + for (const auto& region : regions()) region.get()->Trim(); } virtual void ReleaseResources() { } @@ -385,7 +386,7 @@ protected: // Serial memory operations are needed to ensure, among other things, that allocation failures are // due to true OOM conditions and per region caching (Trim and Allocate must be serial and // exclusive to ensure this). - KernelMutex agent_memory_lock_; + std::mutex agent_memory_lock_; // Forbid copying and moving of this object DISALLOW_COPY_AND_ASSIGN(Agent); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h index a9c77fa18d..c5dd4ccb19 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h @@ -82,7 +82,7 @@ public: /// @brief Override from core::Agent. const std::vector& supported_isas() const override { return supported_isas_; } - const std::vector& regions() const override { return regions_; } + const std::vector>& regions() const override { return regions_; } /// @brief Getter for the AIE system allocator. const std::function& @@ -101,7 +101,7 @@ private: /// @brief Setup the memory allocators used by this agent. void InitAllocators(); - std::vector regions_; + std::vector> regions_; std::function system_allocator_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h index 1f0bfa1215..ef80f69776 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h @@ -306,7 +306,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo // GPU-visible indirect buffer holding PM4 commands. void* pm4_ib_buf_; uint32_t pm4_ib_size_b_; - KernelMutex pm4_ib_mutex_; + std::mutex pm4_ib_mutex_; // Error handler control variable. std::atomic dynamicScratchState, exceptionState; @@ -322,11 +322,11 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo Signal* exception_signal_; // CU mask lock - KernelMutex mask_lock_; + std::mutex mask_lock_; // Mutex to prevent AsyncReclaimScratch and HandleInsufficientScratch from // happening at the same time. - KernelMutex scratch_lock_; + std::mutex scratch_lock_; // Current CU mask std::vector cu_mask_; @@ -345,10 +345,10 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo } // Mutex for queue_event_ manipulation -KernelMutex& queue_lock() { +std::mutex& queue_lock() { // This allocation is meant to last until the last thread has exited. // It is intentionally not freed. - static KernelMutex* queue_lock_ = new KernelMutex(); + static std::mutex* queue_lock_ = new std::mutex(); return *queue_lock_; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 7d820c8437..dfc9ed1006 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -255,7 +255,7 @@ template class BlitSdma : public BlitSdmaBase { // Internal signals for blocking APIs core::unique_signal_ptr signals_[2]; - KernelMutex lock_; + std::mutex lock_; bool parity_; /// Queue resource descriptor for doorbell, read diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h index bfa080cf8c..26c72136c1 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h @@ -127,7 +127,7 @@ class CpuAgent : public core::Agent { } // @brief Override from core::Agent. - const std::vector& regions() const override { + const std::vector>& regions() const override { return regions_; } @@ -151,7 +151,7 @@ class CpuAgent : public core::Agent { // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed // region returns ::HSA_STATUS_SUCCESS. hsa_status_t VisitRegion( - const std::vector& regions, + const std::vector>& regions, hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) const; @@ -166,7 +166,7 @@ class CpuAgent : public core::Agent { std::vector> caches_; // @brief Array of regions owned by this agent. - std::vector regions_; + std::vector> regions_; DISALLOW_COPY_AND_ASSIGN(CpuAgent); }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index d49c4fdd8a..c799cd8611 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -394,7 +394,7 @@ class GpuAgent : public GpuAgentInt { } // @brief Override from core::Agent. - const std::vector& regions() const override { + const std::vector>& regions() const override { return regions_; } @@ -536,7 +536,7 @@ class GpuAgent : public GpuAgentInt { // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed // region returns ::HSA_STATUS_SUCCESS. hsa_status_t VisitRegion( - const std::vector& regions, + const std::vector>& regions, hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) const; @@ -594,7 +594,7 @@ class GpuAgent : public GpuAgentInt { std::vector xgmi_peer_list_; // Protects xgmi_peer_list_ - KernelMutex xgmi_peer_list_lock_; + std::mutex xgmi_peer_list_lock_; // @brief AQL queues for cache management and blit compute usage. enum QueueEnum { @@ -607,19 +607,19 @@ class GpuAgent : public GpuAgentInt { lazy_ptr queues_[QueueCount]; // @brief Mutex to protect the update to coherency type. - KernelMutex coherency_lock_; + std::mutex coherency_lock_; // @brief Mutex to protect access to scratch pool. - KernelMutex scratch_lock_; + std::mutex scratch_lock_; // @brief Mutex to protect access to ::t1_. - KernelMutex t1_lock_; + std::mutex t1_lock_; // @brief Mutex to protect access to blit objects. - KernelMutex blit_lock_; + std::mutex blit_lock_; // @brief Mutex to protect sdma gang submissions. - KernelMutex sdma_gang_lock_; + std::mutex sdma_gang_lock_; // @brief GPU tick on initialization. HsaClockCounters t0_; @@ -638,7 +638,7 @@ class GpuAgent : public GpuAgentInt { std::vector> caches_; // @brief Array of regions owned by this agent. - std::vector regions_; + std::vector> regions_; core::Isa* isa_; @@ -729,7 +729,7 @@ class GpuAgent : public GpuAgentInt { struct { lazy_ptr queue_; int ref_ct_; - KernelMutex lock_; + std::mutex lock_; } gws_queue_; // @brief list of AQL queues owned by this agent. Indexed by queue pointer @@ -763,7 +763,7 @@ class GpuAgent : public GpuAgentInt { /// @brief Coarse-grain deallocator on this GPU. std::function coarsegrain_deallocator_; - void* trap_handler_tma_region_; + std::unique_ptr> trap_handler_tma_region_; /* PC Sampling fields - begin */ /* 2nd level Trap handler code is based on the offsets within this structure */ diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp index d44f4d095b..7027ee3364 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp @@ -181,7 +181,7 @@ namespace code { std::vector dataSegments; std::vector dataSections; std::vector relocationSections; - std::vector symbols; + std::vector> symbols; bool combineDataSegments; Segment* hsaSegments[AMDGPU_HSA_SEGMENT_LAST][2]; Section* hsaSections[AMDGPU_HSA_SECTION_LAST]; @@ -234,7 +234,7 @@ namespace code { uint32_t OsAbi() const { return img->OsAbi(); } AmdHsaCode(bool combineDataSegments = true); - virtual ~AmdHsaCode(); + virtual ~AmdHsaCode() = default; std::string output() { return out.str(); } bool LoadFromFile(const std::string& filename); @@ -347,7 +347,7 @@ namespace code { RelocationSection* GetRelocationSection(size_t i) { return relocationSections[i]; } size_t SymbolCount() { return symbols.size(); } - Symbol* GetSymbol(size_t i) { return symbols[i]; } + Symbol* GetSymbol(size_t i) { return symbols[i].get(); } Symbol* GetSymbolByElfIndex(size_t index); Symbol* FindSymbol(const std::string &n); @@ -362,11 +362,11 @@ namespace code { class AmdHsaCodeManager { private: - typedef std::unordered_map CodeMap; + typedef std::unordered_map> CodeMap; CodeMap codeMap; public: - AmdHsaCode* FromHandle(hsa_code_object_t handle); + const std::shared_ptr& FromHandle(hsa_code_object_t handle); bool Destroy(hsa_code_object_t handle); }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp index 5625e2e1a6..3460b83f24 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp @@ -422,7 +422,7 @@ private: Executable(const Executable &e); Executable& operator=(const Executable &e); - static std::vector executables; + static std::vector> executables; static std::mutex executables_mutex; }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h index 82b110d70d..d52ba26ddc 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -187,7 +187,7 @@ private: // Protects against concurrent allow_access calls to fragments of the same block by virtue of all // fragments of the block routing to the same MemoryRegion. - mutable KernelMutex access_lock_; + mutable std::mutex access_lock_; static const size_t kPageSize_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h index 95d7259b30..f7e1d18fec 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h @@ -216,7 +216,7 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi private: // Serialize packet interception processing. - KernelMutex lock_; + std::mutex lock_; // Largest processed packet index. uint64_t next_packet_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/ipc_signal.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/ipc_signal.h index 0d2e0ae445..8d565093b3 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/ipc_signal.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/ipc_signal.h @@ -103,7 +103,7 @@ class IPCSignal : private SharedMemorySignal, public BusyWaitSignal { static int rtti_id_ = 0; return rtti_id_; } - static KernelMutex lock_; + static std::mutex lock_; explicit IPCSignal(SharedMemorySignal&& abi_block) : SharedMemorySignal(std::move(abi_block)), BusyWaitSignal(signal(), true) {} diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h index 800bc94ca5..4aa92ae95d 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h @@ -51,6 +51,7 @@ #include #include #include +#include #if defined(__linux__) #include #include @@ -437,15 +438,15 @@ class Runtime { Agent* region_gpu() { return region_gpu_; } - const std::vector& system_regions_fine() const { + const std::vector>& system_regions_fine() const { return system_regions_fine_; } - const std::vector& system_regions_coarse() const { + const std::vector>& system_regions_coarse() const { return system_regions_coarse_; } - amd::hsa::loader::Loader* loader() { return loader_; } + amd::hsa::loader::Loader* loader() { return loader_.get(); } amd::LoaderContext* loader_context() { return &loader_context_; } @@ -719,10 +720,10 @@ class Runtime { // Will be created before any user could call hsa_init but also could be // destroyed before incorrectly written programs call hsa_shutdown. - static __forceinline KernelMutex& bootstrap_lock() { + static __forceinline std::mutex& bootstrap_lock() { // This allocation is meant to last until the last thread has exited. // It is intentionally not freed. - static KernelMutex* bootstrap_lock_ = new KernelMutex; + static std::mutex* bootstrap_lock_ = new std::mutex; return *bootstrap_lock_; } Runtime(); @@ -780,7 +781,7 @@ class Runtime { // Also ensures atomicity of pointer info queries by interlocking // KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo // registered & mapped arrays. - KernelSharedMutex memory_lock_; + std::shared_mutex memory_lock_; // Array containing driver interfaces for compatible agent kernel-mode // drivers. Currently supports AIE agents. @@ -811,16 +812,16 @@ class Runtime { std::vector gpu_ids_; // List of all fine grain system memory region in the platform. - std::vector system_regions_fine_; + std::vector> system_regions_fine_; // List of all coarse grain system memory region in the platform. - std::vector system_regions_coarse_; + std::vector> system_regions_coarse_; // Matrix of IO link. std::vector link_matrix_; // Loader instance. - amd::hsa::loader::Loader* loader_; + std::unique_ptr loader_; // Loader context. amd::LoaderContext loader_context_; @@ -832,7 +833,7 @@ class Runtime { std::map allocation_map_; // Pending prefetch containers. - KernelMutex prefetch_lock_; + std::mutex prefetch_lock_; prefetch_map_t prefetch_map_; // Allocator using ::system_region_ @@ -853,24 +854,29 @@ class Runtime { // Number of Numa Nodes size_t num_nodes_; + struct HsaEventDeleter { + void operator()(HsaEvent* event) { InterruptSignal::DestroyEvent(event); } + }; + using unique_hsa_event_ptr = std::unique_ptr; + // @brief AMD HSA event to monitor for virtual memory access fault. - HsaEvent* vm_fault_event_; + unique_hsa_event_ptr vm_fault_event_; // @brief HSA signal to contain the VM fault event. - Signal* vm_fault_signal_; + unique_signal_ptr vm_fault_signal_; // @brief AMD HSA event to monitor for HW exceptions. - HsaEvent* hw_exception_event_; + unique_hsa_event_ptr hw_exception_event_; // @brief HSA signal to contain the HW exceptionevent. - Signal* hw_exception_signal_; + unique_signal_ptr hw_exception_signal_; // Custom system event handlers. std::vector, void*>> system_event_handlers_; // System event handler lock - KernelMutex system_event_lock_; + std::mutex system_event_lock_; // Internal queue creation notifier AMD::callback_t internal_queue_create_notifier_; @@ -898,8 +904,8 @@ class Runtime { // IPC DMA buf unix domain socket server dmabuf FD passing int ipc_sock_server_fd_; - std::map ipc_sock_server_conns_; - KernelMutex ipc_sock_server_lock_; + std::map ipc_sock_server_conns_; + std::mutex ipc_sock_server_lock_; private: void CheckVirtualMemApiSupport(); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h index 4647604223..1dd9260ae6 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/signal.h @@ -50,6 +50,7 @@ #include #include #include +#include #include "hsakmt/hsakmt.h" @@ -499,7 +500,7 @@ class Signal { core::Agent* async_copy_agent_; private: - static KernelMutex ipcLock_; + static std::mutex ipcLock_; static std::map ipcMap_; static Signal* lookupIpc(hsa_signal_t signal); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index c66273ebb8..af7b7a1bf5 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -66,7 +66,6 @@ AieAgent::AieAgent(uint32_t node, const HsaNodeProperties& node_props) } AieAgent::~AieAgent() { - std::for_each(regions_.begin(), regions_.end(), DeleteObject()); regions_.clear(); } @@ -75,8 +74,8 @@ hsa_status_t AieAgent::VisitRegion(bool include_peer, void *data), void *data) const { AMD::callback_t call(callback); - for (const auto r : regions_) { - hsa_region_t region_handle(core::MemoryRegion::Convert(r)); + for (const auto& r : regions_) { + hsa_region_t region_handle(core::MemoryRegion::Convert(r.get())); hsa_status_t err = call(region_handle, data); if (err != HSA_STATUS_SUCCESS) { return err; @@ -321,24 +320,25 @@ void AieAgent::InitRegionList() { /// explicit sync operations. regions_.reserve(3); regions_.push_back( - new MemoryRegion(false, true, false, false, true, this, sys_mem_props)); + std::make_shared(false, true, false, false, true, this, sys_mem_props)); regions_.push_back( - new MemoryRegion(false, false, false, false, true, this, dev_mem_props)); - regions_.push_back(new MemoryRegion(false, false, false, false, true, this, - other_mem_props)); + std::make_shared(false, false, false, false, true, this, dev_mem_props)); + regions_.push_back( + std::make_shared(false, false, false, false, true, this, other_mem_props)); } void AieAgent::InitAllocators() { - for (const auto *region : regions()) { + for (const auto& region : regions()) { const MemoryRegion *amd_mem_region( - static_cast(region)); + static_cast(region.get())); if (amd_mem_region->kernarg()) { + const core::MemoryRegion* region_ptr = region.get(); system_allocator_ = - [region](size_t size, size_t align, + [region_ptr](size_t size, size_t align, core::MemoryRegion::AllocateFlags alloc_flags) -> void * { void *mem(nullptr); return (core::Runtime::runtime_singleton_->AllocateMemory( - region, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS) + region_ptr, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS) ? mem : nullptr; }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 396edeff7e..ff05e8ecc1 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -165,8 +165,8 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_ // Set group and private memory apertures in amd_queue_. auto& regions = agent->regions(); - for (auto region : regions) { - const MemoryRegion* amdregion = static_cast(region); + for (const auto& region : regions) { + const MemoryRegion* amdregion = static_cast(region.get()); uint64_t base = amdregion->GetBaseAddress(); if (amdregion->IsLDS()) { @@ -217,7 +217,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_ } MAKE_NAMED_SCOPE_GUARD(EventGuard, [&]() { - ScopedAcquire _lock(&queue_lock()); + std::lock_guard _lock(queue_lock()); queue_count()--; if (queue_count() == 0) { core::InterruptSignal::DestroyEvent(queue_event()); @@ -232,7 +232,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_ }); if (core::g_use_interrupt_wait) { - ScopedAcquire _lock(&queue_lock()); + std::lock_guard _lock(queue_lock()); queue_count()++; if (queue_event() == nullptr) { assert(queue_count() == 1 && "Inconsistency in queue event reference counting found.\n"); @@ -387,7 +387,7 @@ AqlQueue::~AqlQueue() { FreeQueueMemory(); if (core::g_use_interrupt_wait) { - ScopedAcquire lock(&queue_lock()); + std::lock_guard lock(queue_lock()); queue_count()--; if (queue_count() == 0) { core::InterruptSignal::DestroyEvent(queue_event()); @@ -777,7 +777,7 @@ void AqlQueue::AsyncReclaimMainScratch() { tool::notify_event_scratch_async_reclaim_start(public_handle(), HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE); - ScopedAcquire lock(&scratch_lock_); + std::lock_guard lock(scratch_lock_); // Unmap the queue. CP will check amd_queue_ fields on re-map Suspend(); @@ -849,7 +849,7 @@ void AqlQueue::AsyncReclaimAltScratch() { tool::notify_event_scratch_async_reclaim_start(public_handle(), HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT); - ScopedAcquire lock(&scratch_lock_); + std::lock_guard lock(scratch_lock_); // Unmap the queue. CP will check amd_queue_ fields on re-map Suspend(); @@ -1014,7 +1014,7 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code, const uint64_t device_size = size_per_thread * lanes_per_wave * device_slots; const uint64_t dispatch_size = size_per_thread * lanes_per_wave * dispatch_slots; - ScopedAcquire lock(&scratch_lock_); + std::lock_guard lock(scratch_lock_); // scratch.use_alt_limit will be 0 if alt scratch is not supported or disabled if (dispatch_size < scratch.use_alt_limit && dispatch_slots < device_slots) { @@ -1393,7 +1393,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* if ((mask.size() == mask_dwords) && (tail_mask != 0)) mask[mask_dwords - 1] &= tail_mask; // Apply mask if non-default or not queue initialization. - ScopedAcquire lock(&mask_lock_); + std::lock_guard lock(mask_lock_); if ((!cu_mask_.empty()) || (num_cu_mask_count != 0) || (!global_mask.empty())) { // Devices with WGPs must conform to even-indexed contiguous pairwise CU enablement. @@ -1414,7 +1414,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* } hsa_status_t AqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) { - ScopedAcquire lock(&mask_lock_); + std::lock_guard lock(mask_lock_); assert(!cu_mask_.empty() && "No current cu_mask!"); uint32_t user_dword_count = num_cu_mask_count / 32; @@ -1440,7 +1440,7 @@ void AqlQueue::SetProfiling(bool enabled) { void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope_t acquireFence, hsa_fence_scope_t releaseFence, hsa_signal_t* in_signal) { // pm4_ib_buf_ is a shared resource, so mutually exclude here. - ScopedAcquire lock(&pm4_ib_mutex_); + std::lock_guard lock(pm4_ib_mutex_); // Obtain reference to any container queue. core::Queue* queue = core::Queue::Convert(public_handle()); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index 3067f827d3..d1681f968a 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -293,7 +293,7 @@ static bool DepSignalCompleteHandler(hsa_signal_value_t signal_value, void *arg template hsa_status_t BlitSdma::SubmitBlockingCommand(const void* cmd, size_t cmd_size, uint64_t size) { - ScopedAcquire lock(&lock_); + std::unique_lock lock(lock_); // Alternate between completion signals // Using two allows overlapping command writing and copies @@ -310,7 +310,7 @@ hsa_status_t BlitSdma::SubmitBlockingCommand(const void* cmd, size_t cmd // Mark signal as in use, guard against exception leaving the signal in an unusable state. completionSignal->StoreRelaxed(2); MAKE_SCOPE_GUARD([&]() { completionSignal->StoreRelaxed(0); }); - lock.Release(); + lock.unlock(); std::vector gang_signals(0); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp index 37eda03d85..5ad9730344 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp @@ -64,7 +64,6 @@ CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, } CpuAgent::~CpuAgent() { - std::for_each(regions_.begin(), regions_.end(), DeleteObject()); regions_.clear(); } @@ -87,17 +86,17 @@ void CpuAgent::InitRegionList() { if (system_prop != mem_props.end()) system_props = *system_prop; // Fine-Grain Memory - regions_.push_back(new MemoryRegion(true, false, is_apu_node, false, true, this, system_props)); + regions_.push_back(std::make_shared(true, false, is_apu_node, false, true, this, system_props)); // Ext-Fine-Grain Memory - regions_.push_back(new MemoryRegion(false, false, is_apu_node, true, true, this, system_props)); + regions_.push_back(std::make_shared(false, false, is_apu_node, true, true, this, system_props)); // Kernargs - regions_.push_back(new MemoryRegion(true, true, is_apu_node, false, true, this, system_props)); + regions_.push_back(std::make_shared(true, true, is_apu_node, false, true, this, system_props)); if (!is_apu_node) { // Coarse Grain - regions_.push_back(new MemoryRegion(false, false, is_apu_node, false, true, this, system_props)); + regions_.push_back(std::make_shared(false, false, is_apu_node, false, true, this, system_props)); } } } @@ -150,12 +149,12 @@ hsa_status_t CpuAgent::VisitRegion(bool include_peer, } hsa_status_t CpuAgent::VisitRegion( - const std::vector& regions, + const std::vector>& regions, hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) const { - for (const core::MemoryRegion* region : regions) { + for (const std::shared_ptr& region : regions) { if (!region->user_visible()) continue; - hsa_region_t region_handle = core::MemoryRegion::Convert(region); + hsa_region_t region_handle = core::MemoryRegion::Convert(region.get()); hsa_status_t status = callback(region_handle, data); if (status != HSA_STATUS_SUCCESS) { return status; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 2537bb8256..cf4e90a5ab 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -112,7 +112,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna scratch_limit_async_threshold_(0), scratch_cache_( [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }), - trap_handler_tma_region_(NULL), + trap_handler_tma_region_(nullptr, [this](void* ptr){ + if (ptr && this->finegrain_allocator_) this->finegrain_deallocator()(ptr); + }), rec_sdma_eng_override_(false), pcs_hosttrap_data_(), pcs_stochastic_data_(), @@ -246,7 +248,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna GpuAgent::~GpuAgent() { for (auto& blit : blits_) blit.reset(); - std::for_each(regions_.begin(), regions_.end(), DeleteObject()); regions_.clear(); } @@ -454,22 +455,20 @@ void GpuAgent::InitRegionList() { memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax; case HSA_HEAPTYPE_GPU_LDS: case HSA_HEAPTYPE_GPU_SCRATCH: { - MemoryRegion* region = - new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]); - + std::shared_ptr region = std::make_shared(false, false, false, false, true, this, mem_props[mem_idx]); regions_.push_back(region); if (region->IsLocalMemory()) { // Extended Fine-Grain memory if (!(isa_->GetMajorVersion() == 12 && isa_->GetMinorVersion() == 0)) regions_.push_back( - new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx])); + std::make_shared(false, false, false, true, true, this, mem_props[mem_idx])); // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. bool user_visible = (properties_.HiveID != 0) || core::Runtime::runtime_singleton_->flag().fine_grain_pcie(); - regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this, + regions_.push_back(std::make_shared(true, false, false, false, user_visible, this, mem_props[mem_idx])); } break; @@ -561,7 +560,7 @@ void GpuAgent::ReserveScratch() size_t available; hsa_status_t err = driver().AvailableMemory(node_id(), &available); assert(err == HSA_STATUS_SUCCESS && "AvailableMemory failed"); - ScopedAcquire lock(&scratch_lock_); + std::lock_guard lock(scratch_lock_); if (!scratch_cache_.reserved_bytes() && reserved_sz && available > 8 * reserved_sz) { HSAuint64 alt_va; void* reserved_base = scratch_pool_.alloc(reserved_sz); @@ -676,20 +675,20 @@ hsa_status_t GpuAgent::VisitRegion(bool include_peer, } hsa_status_t GpuAgent::VisitRegion( - const std::vector& regions, + const std::vector>& regions, hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) const { AMD::callback_t call(callback); - for (const core::MemoryRegion* region : regions) { + for (const auto& region : regions) { if (!region->user_visible()) continue; const AMD::MemoryRegion* amd_region = - reinterpret_cast(region); + reinterpret_cast(region.get()); // Only expose system, local, and LDS memory. if (amd_region->IsSystem() || amd_region->IsLocalMemory() || amd_region->IsLDS()) { - hsa_region_t region_handle = core::MemoryRegion::Convert(region); + hsa_region_t region_handle = core::MemoryRegion::Convert(region.get()); hsa_status_t status = call(region_handle, data); if (status != HSA_STATUS_SUCCESS) { return status; @@ -910,7 +909,7 @@ void GpuAgent::InitGWS() { } void GpuAgent::GWSRelease() { - ScopedAcquire lock(&gws_queue_.lock_); + std::lock_guard lock(gws_queue_.lock_); gws_queue_.ref_ct_--; if (gws_queue_.ref_ct_ != 0) return; InitGWS(); @@ -968,22 +967,22 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) { } void GpuAgent::SetCopyRequestRefCount(bool set) { - ScopedAcquire lock(&blit_lock_); + std::unique_lock lock(blit_lock_); while (pending_copy_stat_check_ref_) { - blit_lock_.Release(); + lock.unlock(); os::YieldThread(); - blit_lock_.Acquire(); + lock.lock(); } if (!set && pending_copy_req_ref_) pending_copy_req_ref_--; else pending_copy_req_ref_++; } void GpuAgent::SetCopyStatusCheckRefCount(bool set) { - ScopedAcquire lock(&blit_lock_); + std::unique_lock lock(blit_lock_); while (pending_copy_req_ref_) { - blit_lock_.Release(); + lock.unlock(); os::YieldThread(); - blit_lock_.Acquire(); + lock.lock(); } if (!set && pending_copy_stat_check_ref_) pending_copy_stat_check_ref_--; else pending_copy_stat_check_ref_++; @@ -1059,7 +1058,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, std::min(gang_factor, properties_.NumSdmaXgmiEngines); } - ScopedAcquire lock(&sdma_gang_lock_); + std::lock_guard lock(sdma_gang_lock_); // Manage internal gang signals std::vector gang_signals; if (gang_factor > 1) { @@ -1642,7 +1641,7 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { if (status != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT; - for (auto r : regions()) availableBytes += ((AMD::MemoryRegion*)r)->GetCacheSize(); + for (const auto& r : regions()) availableBytes += ((AMD::MemoryRegion*)(r.get()))->GetCacheSize(); availableBytes += scratch_cache_.free_bytes() - scratch_cache_.reserved_bytes(); @@ -1730,7 +1729,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u core::Queue** queue) { // Handle GWS queues. if (queue_type == HSA_QUEUE_TYPE_COOPERATIVE) { - ScopedAcquire lock(&gws_queue_.lock_); + std::lock_guard lock(gws_queue_.lock_); auto ret = (*gws_queue_.queue_).get(); if (ret != nullptr) { gws_queue_.ref_ct_++; @@ -1876,7 +1875,7 @@ void GpuAgent::AcquireQueueMainScratch(ScratchInfo& scratch) { */ bool large; - ScopedAcquire lock(&scratch_lock_); + std::lock_guard lock(scratch_lock_); const size_t small_limit = scratch_pool_.size() >> 3; bool use_reclaim = true; @@ -2035,7 +2034,7 @@ void GpuAgent::AcquireQueueAltScratch(ScratchInfo& scratch) { uint64_t size_per_wave = AlignUp(scratch.alt_size_per_thread * properties_.WaveFrontSize, 1024); if (size_per_wave > MAX_WAVE_SCRATCH) return; - ScopedAcquire lock(&scratch_lock_); + std::lock_guard lock(scratch_lock_); // Ensure mapping will be in whole pages. scratch.alt_size = AlignUp(scratch.alt_size, 4096); @@ -2176,7 +2175,7 @@ uint64_t GpuAgent::TranslateTime(uint64_t tick) { // Limit errors due to relative frequency drift to ~0.5us. Sync clocks at 16Hz. const int64_t max_extrapolation = core::Runtime::runtime_singleton_->sys_clock_freq() >> 4; - ScopedAcquire lock(&t1_lock_); + std::lock_guard lock(t1_lock_); // Limit errors due to correlated pair certainty to ~0.5us. // extrapolated time < (0.5us / half clock read certainty) * delay between clock measures // clock read certainty is <4us. @@ -2261,26 +2260,27 @@ hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttra ((uint64_t*)tma_region_host)[1] = (uint64_t)pcs_stochastic_buffers; if (!trap_handler_tma_region_) { - trap_handler_tma_region_ = (uint64_t*)finegrain_allocator()(2 * sizeof(uint64_t), 0); - if (trap_handler_tma_region_ == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + void* mem = (uint64_t*)finegrain_allocator()(2 * sizeof(uint64_t), 0); + if (!mem) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + trap_handler_tma_region_.reset(mem); // NearestCpuAgent owns pool returned system_allocator() auto cpuAgent = GetNearestCpuAgent()->public_handle(); hsa_status_t ret = - AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_); + AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_.get()); assert(ret == HSA_STATUS_SUCCESS); } /* On non-large BAR systems, we may not be able to access device memory, so do a DmaCopy */ - if (DmaCopy(trap_handler_tma_region_, tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS) + if (DmaCopy(trap_handler_tma_region_.get(), tma_region_host, 2 * sizeof(uint64_t)) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; tma_size = 2 * sizeof(uint64_t); - tma_addr = trap_handler_tma_region_; + tma_addr = trap_handler_tma_region_.get(); } else if (trap_handler_tma_region_) { - finegrain_deallocator()(trap_handler_tma_region_); - trap_handler_tma_region_ = NULL; + trap_handler_tma_region_.reset(nullptr); } // Bind the trap handler to this node. @@ -2398,7 +2398,7 @@ lazy_ptr& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) { uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines; assert((xgmi_engine_cnt > 0) && ("Illegal condition, should not happen")); - ScopedAcquire lock(&xgmi_peer_list_lock_); + std::lock_guard lock(xgmi_peer_list_lock_); for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) { uint64_t dst_handle = dst_agent.public_handle().handle; @@ -2490,19 +2490,20 @@ lazy_ptr& GpuAgent::GetBlitObject(const core::Agent& dst_agent, void GpuAgent::Trim() { Agent::Trim(); AsyncReclaimScratchQueues(); - ScopedAcquire lock(&scratch_lock_); + std::lock_guard lock(scratch_lock_); scratch_cache_.trim(false); } void GpuAgent::InitAllocators() { - for (auto pool : GetNearestCpuAgent()->regions()) { + for (const auto& pool : GetNearestCpuAgent()->regions()) { if (pool->kernarg()) { - system_allocator_ = [pool](size_t size, size_t alignment, + const core::MemoryRegion* pool_ptr = pool.get(); + system_allocator_ = [pool_ptr](size_t size, size_t alignment, MemoryRegion::AllocateFlags alloc_flags) -> void* { assert(alignment <= 4096); void* ptr = nullptr; return (HSA_STATUS_SUCCESS == - core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, &ptr)) + core::Runtime::runtime_singleton_->AllocateMemory(pool_ptr, size, alloc_flags, &ptr)) ? ptr : nullptr; }; @@ -2513,14 +2514,14 @@ void GpuAgent::InitAllocators() { assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool."); // Setup this GPU's fine-grain and coarse-grain allocators. - for (auto region : regions()) { - const AMD::MemoryRegion* amd_region = static_cast(region); + for (const auto& region : regions()) { + const AMD::MemoryRegion* amd_region = static_cast(region.get()); - auto region_allocator = [region](size_t size, + auto region_allocator = [amd_region](size_t size, MemoryRegion::AllocateFlags alloc_flags) -> void* { void* ptr = nullptr; return (HSA_STATUS_SUCCESS == - core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr)) + core::Runtime::runtime_singleton_->AllocateMemory(amd_region, size, alloc_flags, &ptr)) ? ptr : nullptr; }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp index 51ad5dc04a..a76005656f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp @@ -283,18 +283,18 @@ const core::MemoryRegion* RegionMemory::AgentLocal(hsa_agent_t agent, bool is_co assert(amd_agent->device_type() == core::Agent::kAmdGpuDevice && "Invalid agent type."); auto agent_local_region = std::find_if(amd_agent->regions().begin(), amd_agent->regions().end(), - [&](const core::MemoryRegion* region) { - const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region; + [&](const std::shared_ptr& region) { + const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region.get(); return amd_region->IsLocalMemory() && (!amd_region->fine_grain()); }); - return agent_local_region == amd_agent->regions().end() ? nullptr : *agent_local_region; + return agent_local_region == amd_agent->regions().end() ? nullptr : agent_local_region->get(); } const core::MemoryRegion* RegionMemory::System(bool is_code) { if (is_code) - return core::Runtime::runtime_singleton_->system_regions_coarse()[0]; + return core::Runtime::runtime_singleton_->system_regions_coarse()[0].get(); else - return core::Runtime::runtime_singleton_->system_regions_fine()[0]; + return core::Runtime::runtime_singleton_->system_regions_fine()[0].get(); } bool RegionMemory::Allocate(size_t size, size_t align, bool zero) { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index 842ef96165..dce3912d58 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -48,6 +48,8 @@ #include "core/inc/amd_memory_region.h" #include +#include +#include #include "core/inc/runtime.h" #include "core/inc/amd_cpu_agent.h" @@ -132,7 +134,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, MemoryRegion::~MemoryRegion() {} hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const { - ScopedAcquire lock(&owner()->agent_memory_lock_); + std::lock_guard lock(owner()->agent_memory_lock_); return AllocateImpl(size, alloc_flags, address, agent_node_id); } @@ -160,7 +162,7 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, } hsa_status_t MemoryRegion::Free(void* address, size_t size) const { - ScopedAcquire lock(&owner()->agent_memory_lock_); + std::lock_guard lock(owner()->agent_memory_lock_); return FreeImpl(address, size); } @@ -172,7 +174,7 @@ hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const { // TODO: Look into a better name and/or making this process transparent to exporting. hsa_status_t MemoryRegion::IPCFragmentExport(void* address) const { - ScopedAcquire lock(&owner()->agent_memory_lock_); + std::lock_guard lock(owner()->agent_memory_lock_); if (!fragment_allocator_.discardBlock(address)) return HSA_STATUS_ERROR_INVALID_ALLOCATION; return HSA_STATUS_SUCCESS; } @@ -448,7 +450,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents, std::vector union_agents; info.size = sizeof(info); - ScopedAcquire lock(&access_lock_); + std::lock_guard lock(access_lock_); if (core::Runtime::runtime_singleton_->PtrInfo(const_cast(ptr), &info, malloc, &agent_count, &accessible, @@ -512,8 +514,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents, { // Sequence with pointer info since queries to other fragments of the block may be adjusted by // this call. - ScopedAcquire lock( - core::Runtime::runtime_singleton_->memory_lock_.shared()); + std::shared_lock lock(core::Runtime::runtime_singleton_->memory_lock_); uint64_t alternate_va = 0; if (owner()->driver().MakeMemoryResident(ptr, size, &alternate_va, &map_flag, whitelist_nodes.size(), diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp index 147ce1ba6d..291a4382a9 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp @@ -1804,7 +1804,7 @@ hsa_status_t hsa_code_object_serialize( IS_BAD_PTR(serialized_code_object); IS_BAD_PTR(serialized_code_object_size); - amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object); + amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get(); if (!code) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -1982,7 +1982,7 @@ hsa_status_t hsa_code_object_get_info( IS_OPEN(); IS_BAD_PTR(value); - amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object); + amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get(); if (!code) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -2039,7 +2039,7 @@ hsa_status_t hsa_code_object_get_symbol( IS_BAD_PTR(symbol_name); IS_BAD_PTR(symbol); - amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object); + amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get(); if (!code) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -2059,7 +2059,7 @@ hsa_status_t hsa_code_object_get_symbol_from_name( IS_BAD_PTR(symbol_name); IS_BAD_PTR(symbol); - amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object); + amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get(); if (!code) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -2097,7 +2097,7 @@ hsa_status_t hsa_code_object_iterate_symbols( IS_OPEN(); IS_BAD_PTR(callback); - amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object); + amd::hsa::code::AmdHsaCode *code = GetCodeManager()->FromHandle(code_object).get(); if (!code) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index 9e71e71f94..b81c919eeb 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -759,7 +759,7 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size, } const AMD::MemoryRegion* system_region = static_cast( - core::Runtime::runtime_singleton_->system_regions_coarse()[0]); + core::Runtime::runtime_singleton_->system_regions_coarse()[0].get()); return system_region->Lock(num_agent, agents, host_ptr, size, 0, agent_ptr); CATCH; @@ -799,7 +799,7 @@ hsa_status_t hsa_amd_memory_unlock(void* host_ptr) { const AMD::MemoryRegion* system_region = reinterpret_cast( - core::Runtime::runtime_singleton_->system_regions_fine()[0]); + core::Runtime::runtime_singleton_->system_regions_fine()[0].get()); return system_region->Unlock(host_ptr); CATCH; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp index ebae2fce0e..d1eef19208 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp @@ -340,7 +340,7 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) { return; } - ScopedAcquire lock(&lock_); + std::lock_guard lock(lock_); // Submit overflow packets. if (!overflow_.empty()) { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp index bf0d7179d7..c1b4b21b67 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp @@ -48,7 +48,7 @@ namespace rocr { namespace core { HsaEvent* InterruptSignal::EventPool::alloc() { - ScopedAcquire lock(&lock_); + std::lock_guard lock(lock_); if (events_.empty()) { if (!allEventsAllocated) { HsaEvent* evt = InterruptSignal::CreateEvent(HSA_EVENTTYPE_SIGNAL, false); @@ -64,7 +64,7 @@ HsaEvent* InterruptSignal::EventPool::alloc() { void InterruptSignal::EventPool::free(HsaEvent* evt) { if (evt == nullptr) return; - ScopedAcquire lock(&lock_); + std::lock_guard lock(lock_); events_.push_back(unique_event_ptr(evt)); } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/ipc_signal.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/ipc_signal.cpp index 2c22918053..c8e07fe2c0 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/ipc_signal.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/ipc_signal.cpp @@ -50,7 +50,7 @@ namespace rocr { namespace core { -KernelMutex IPCSignal::lock_; +std::mutex IPCSignal::lock_; SharedMemory::SharedMemory(const hsa_amd_ipc_memory_t* handle, size_t len) { hsa_status_t err = Runtime::runtime_singleton_->IPCAttach(handle, len, 0, NULL, &ptr_); @@ -85,7 +85,7 @@ Signal* IPCSignal::Attach(const hsa_amd_ipc_signal_t* ipc_signal_handle) { hsa_signal_t handle = SharedSignal::Convert(shared.signal()); - ScopedAcquire lock(&lock_); + std::lock_guard lock(lock_); Signal* ret = core::Signal::DuplicateHandle(handle); if (ret == nullptr) ret = new IPCSignal(std::move(shared)); return ret; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index 3d34133c7f..254e7b80d8 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -48,6 +48,7 @@ #include #include #include +#include #if defined(__linux__) #include #include @@ -119,7 +120,7 @@ bool g_use_mwaitx; Runtime* Runtime::runtime_singleton_ = NULL; hsa_status_t Runtime::Acquire() { - ScopedAcquire boot(&bootstrap_lock()); + std::lock_guard boot(bootstrap_lock()); if (runtime_singleton_ == NULL) { memset(log_flags, 0, sizeof(log_flags)); @@ -146,7 +147,7 @@ hsa_status_t Runtime::Acquire() { } hsa_status_t Runtime::Release() { - ScopedAcquire boot(&bootstrap_lock()); + std::lock_guard boot(bootstrap_lock()); if (runtime_singleton_ == nullptr) return HSA_STATUS_ERROR_NOT_INITIALIZED; @@ -192,7 +193,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) { agents_by_gpuid_[0] = agent; // Add cpu regions to the system region list. - for (const core::MemoryRegion* region : agent->regions()) { + for (auto region : agent->regions()) { if (region->fine_grain()) { system_regions_fine_.push_back(region); } else { @@ -216,7 +217,7 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) { assert(alignment <= 4096); void* ptr = NULL; return (HSA_STATUS_SUCCESS == - core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, + core::Runtime::runtime_singleton_->AllocateMemory(pool.get(), size, alloc_flags, &ptr, agent_node_id)) ? ptr : NULL; @@ -336,7 +337,7 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size, hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id); // Track the allocation result so that it could be freed properly. if (status == HSA_STATUS_SUCCESS) { - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); allocation_map_[*address] = AllocationRegion(region, size, size_requested, alloc_flags); } @@ -354,7 +355,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { MemoryRegion::AllocateFlags alloc_flags = core::MemoryRegion::AllocateNoFlags; { - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); std::map::iterator it = allocation_map_.find(ptr); @@ -458,7 +459,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_callback_t callback, void* user_data) { - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); auto mem = allocation_map_.upper_bound(ptr); if (mem != allocation_map_.begin()) { mem--; @@ -482,7 +483,7 @@ hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_ca hsa_status_t Runtime::DeregisterReleaseNotifier(void* ptr, hsa_amd_deallocation_callback_t callback) { hsa_status_t ret = HSA_STATUS_ERROR_INVALID_ARGUMENT; - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); auto mem = allocation_map_.upper_bound(ptr); if (mem != allocation_map_.begin()) { mem--; @@ -552,7 +553,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { // GPU-CPU // Must ensure that system memory is visible to the GPU during the copy. const AMD::MemoryRegion* system_region = - static_cast(system_regions_fine_[0]); + static_cast(system_regions_fine_[0].get()); void* gpuPtr = nullptr; const auto& locked_copy = [&](void*& ptr, core::Agent* locking_agent) { @@ -698,7 +699,7 @@ hsa_status_t Runtime::AllowAccess(uint32_t num_agents, size_t alloc_size = 0; { - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); std::map::const_iterator it = allocation_map_.find(ptr); @@ -929,7 +930,7 @@ hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents, *size = info.SizeInBytes; *ptr = info.MemoryAddress; - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); allocation_map_[info.MemoryAddress] = AllocationRegion( nullptr, info.SizeInBytes, info.SizeInBytes, core::MemoryRegion::AllocateNoFlags); @@ -1055,7 +1056,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi { // memory_lock protects access to the NMappedNodes array and fragment user data since these may // change with calls to memory APIs. - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); if (VMemoryPtrInfo(ptr, &retInfo, alloc, num_agents_accessible, accessible) == HSA_STATUS_SUCCESS) { @@ -1196,7 +1197,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi hsa_status_t Runtime::SetPtrInfoData(const void* ptr, void* userptr) { { // Use allocation map if possible to handle fragments. - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); const auto& it = allocation_map_.find(ptr); if (it != allocation_map_.end()) { it->second.user_ptr = userptr; @@ -1307,7 +1308,7 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) { size_t len = 0; // Search for registered export pointer - ScopedAcquire lock(&ipc_sock_server_lock_); + std::lock_guard lock(ipc_sock_server_lock_); for (auto& conns : ipc_sock_server_conns_) { if (conn_handle == conns.first) { ptr = reinterpret_cast(conn_handle); @@ -1372,7 +1373,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han if (useFrag) { handle->handle[6] |= 0x80000000 | fragOffset; // Prevent realloction of fragment for better performance. - ScopedAcquire lock(memory_lock_.shared()); + std::shared_lock lock(memory_lock_); err = allocation_map_[ptr].region->IPCFragmentExport(ptr); assert(err == HSA_STATUS_SUCCESS && "Region inconsistent with address map."); } @@ -1439,7 +1440,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han close(dmabuf_fd); - ScopedAcquire lock(&ipc_sock_server_lock_); + std::lock_guard lock(ipc_sock_server_lock_); #if defined(__linux__) if (!ipc_sock_server_conns_.size()) { // create new runtime socket server struct sockaddr_un address; @@ -1549,7 +1550,7 @@ int Runtime::IPCClientImport(uint32_t conn_handle, uint64_t dmabuf_fd_handle, // Store the buffer object handle in allocation map for later use if (err == HSAKMT_STATUS_SUCCESS) { - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); allocation_map_[*importAddress] = AllocationRegion(nullptr, *importSize, *importSize, core::MemoryRegion::AllocateNoFlags); allocation_map_[*importAddress].ldrm_bo = res.buf_handle; @@ -1579,7 +1580,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, importAddress = reinterpret_cast(importAddress) + fragOffset; len = Min(len, importSize - fragOffset); } - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); allocation_map_[importAddress] = AllocationRegion(nullptr, len, len, core::MemoryRegion::AllocateNoFlags); allocation_map_[importAddress].ldrm_bo = ldrm_bo; @@ -1711,7 +1712,7 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, hsa_status_t Runtime::IPCDetach(void* ptr) { bool ldrmImportCleaned = false; { // Handle imported fragments. - ScopedAcquire lock(&memory_lock_); + std::unique_lock lock(memory_lock_); const auto& it = allocation_map_.find(ptr); if (it != allocation_map_.end()) { if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT; @@ -1728,7 +1729,7 @@ hsa_status_t Runtime::IPCDetach(void* ptr) { assert(!"Unimplemented!"); #endif allocation_map_.erase(it); - lock.Release(); // Can't hold memory lock when using pointer info. + lock.unlock(); // Can't hold memory lock when using pointer info. PtrInfoBlockData block = {}; hsa_amd_pointer_info_t info = {}; @@ -1954,7 +1955,7 @@ void Runtime::AsyncEventsPool::clear() { } Runtime::AsyncEventItem* Runtime::AsyncEventsPool::alloc() { - ScopedAcquire lock(&lock_); + std::lock_guard lock(lock_); if (free_list_.empty()) { AsyncEventItem* block = reinterpret_cast( allocate_()(block_size_ * sizeof(AsyncEventItem), __alignof(AsyncEventItem), core::MemoryRegion::AllocateNonPaged, 0)); @@ -1985,7 +1986,7 @@ void Runtime::AsyncEventsPool::free(AsyncEventItem* ptr) { if (ptr == nullptr) return; ptr->~AsyncEventItem(); - ScopedAcquire lock(&lock_); + std::lock_guard lock(lock_); ifdebug { bool valid = false; @@ -2059,33 +2060,33 @@ void Runtime::BindErrorHandlers() { // Create memory event with manual reset to avoid racing condition // with driver in case of multiple concurrent VM faults. - vm_fault_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true); + vm_fault_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true)); // Create an interrupt signal object to contain the memory event. // This signal object will be registered with the async handler global // thread. - vm_fault_signal_ = new core::InterruptSignal(0, vm_fault_event_); + vm_fault_signal_.reset(new core::InterruptSignal(0, vm_fault_event_.get())); if (!vm_fault_signal_->IsValid() || vm_fault_signal_->EopEvent() == NULL) { assert(false && "Failed on creating VM fault signal"); return; } - SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), HSA_SIGNAL_CONDITION_NE, 0, - VMFaultHandler, reinterpret_cast(vm_fault_signal_)); + SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0, + VMFaultHandler, reinterpret_cast(vm_fault_signal_.get())); // Create HW exception event which is for Non-RAS events - hw_exception_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true); + hw_exception_event_.reset(core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_HW_EXCEPTION, true)); - hw_exception_signal_ = new core::InterruptSignal(0, hw_exception_event_); + hw_exception_signal_.reset(new core::InterruptSignal(0, hw_exception_event_.get())); if (!hw_exception_signal_->IsValid() || hw_exception_signal_->EopEvent() == NULL) { assert(false && "Failed on creating HW Exception signal"); return; } - SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_), HSA_SIGNAL_CONDITION_NE, 0, - HwExceptionHandler, reinterpret_cast(hw_exception_signal_)); + SetAsyncSignalHandler(core::Signal::Convert(hw_exception_signal_.get()), HSA_SIGNAL_CONDITION_NE, 0, + HwExceptionHandler, reinterpret_cast(hw_exception_signal_.get())); } bool Runtime::HwExceptionHandler(hsa_signal_value_t val, void* arg) { @@ -2262,7 +2263,8 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { } void Runtime::PrintMemoryMapNear(void* ptr) { - runtime_singleton_->memory_lock_.Acquire(); + std::unique_lock lock(runtime_singleton_->memory_lock_); + auto it = runtime_singleton_->allocation_map_.upper_bound(ptr); for (int i = 0; i < 2; i++) { if (it != runtime_singleton_->allocation_map_.begin()) it--; @@ -2287,8 +2289,9 @@ void Runtime::PrintMemoryMapNear(void* ptr) { it++; } fprintf(stderr, "\n"); - it = start; - runtime_singleton_->memory_lock_.Release(); + it = start; + lock.unlock(); + hsa_amd_pointer_info_t info = {}; PtrInfoBlockData block = {}; uint32_t count = 0; @@ -2408,7 +2411,7 @@ hsa_status_t Runtime::Load() { BindErrorHandlers(); - loader_ = amd::hsa::loader::Loader::Create(&loader_context_); + loader_.reset(amd::hsa::loader::Loader::Create(&loader_context_)); // Load extensions LoadExtensions(); @@ -2449,8 +2452,8 @@ void Runtime::Unload() { UnloadTools(); UnloadExtensions(); - amd::hsa::loader::Loader::Destroy(loader_); - loader_ = nullptr; + amd::hsa::loader::Loader::Destroy(loader_.get()); + loader_.reset(); for(auto nodeAgent: agents_by_node_) { for (auto agent: nodeAgent.second) @@ -2462,17 +2465,17 @@ void Runtime::Unload() { if (vm_fault_signal_ != nullptr) { vm_fault_signal_->DestroySignal(); - vm_fault_signal_ = nullptr; + vm_fault_signal_.reset(); } - core::InterruptSignal::DestroyEvent(vm_fault_event_); - vm_fault_event_ = nullptr; + + vm_fault_event_.reset(); if (hw_exception_signal_ != nullptr) { hw_exception_signal_->DestroySignal(); - hw_exception_signal_ = nullptr; + hw_exception_signal_.reset(); } - core::InterruptSignal::DestroyEvent(hw_exception_event_); - hw_exception_event_ = nullptr; + + hw_exception_event_.reset(); SharedSignalPool.clear(); @@ -2890,7 +2893,7 @@ void Runtime::AsyncEvents::Clear() { hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback, void* data) { - ScopedAcquire lock(&system_event_lock_); + std::lock_guard lock(system_event_lock_); system_event_handlers_.push_back( std::make_pair(AMD::callback_t(callback), data)); return HSA_STATUS_SUCCESS; @@ -2898,7 +2901,7 @@ hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_ std::vector, void*>> Runtime::GetSystemEventHandlers() { - ScopedAcquire lock(&system_event_lock_); + std::lock_guard lock(system_event_lock_); return system_event_handlers_; } @@ -3269,7 +3272,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, } { - ScopedAcquire lock(&prefetch_lock_); + std::lock_guard lock(prefetch_lock_); // Remove all fully overlapped and trim partially overlapped ranges. // Get iteration bounds auto start = prefetch_map_.upper_bound(base); @@ -3332,7 +3335,7 @@ hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, // Remove the prefetch's ranges from the map. static auto removePrefetchRanges = [](PrefetchOp* op) { - ScopedAcquire lock(&Runtime::runtime_singleton_->prefetch_lock_); + std::lock_guard lock(Runtime::runtime_singleton_->prefetch_lock_); auto it = op->prefetch_map_entry; while (it != Runtime::runtime_singleton_->prefetch_map_.end()) { auto next = it->second.next; @@ -3389,7 +3392,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) { std::vector> holes; - ScopedAcquire lock(&Runtime::runtime_singleton_->prefetch_lock_); + std::lock_guard lock(Runtime::runtime_singleton_->prefetch_lock_); auto start = prefetch_map_.upper_bound(base); if (start != prefetch_map_.begin()) start--; auto stop = prefetch_map_.lower_bound(end); @@ -3441,7 +3444,7 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) { hsa_status_t Runtime::DmaBufExport(const void* ptr, size_t size, int* dmabuf, uint64_t* offset, uint64_t flags) { #ifdef __linux__ - ScopedAcquire lock(memory_lock_.shared()); + std::shared_lock lock(memory_lock_); // Lookup containing allocation. auto mem = allocation_map_.upper_bound(ptr); if (mem != allocation_map_.begin()) { @@ -3507,7 +3510,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add if (!alignment) alignment = rocr::os::PageSize(); - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); if (flags & HSA_AMD_VMEM_ADDRESS_NO_REGISTER) { size_t requested = size + alignment - rocr::os::PageSize(); @@ -3548,7 +3551,7 @@ hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t add } hsa_status_t Runtime::VMemoryAddressFree(void* va, size_t size) { - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); std::map::iterator it = reserved_address_map_.find(va); if (it == reserved_address_map_.end()) { @@ -3580,7 +3583,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz if (!IsMultipleOf(size, memRegion->GetPageSize())) return HSA_STATUS_ERROR_INVALID_ARGUMENT; - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); ThunkHandle user_mode_driver_handle; hsa_status_t status = region->Allocate(size, alloc_flags, &user_mode_driver_handle, 0); @@ -3597,7 +3600,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz } hsa_status_t Runtime::VMemoryHandleRelease(hsa_amd_vmem_alloc_handle_t memoryOnlyHandle) { - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); auto memoryHandleIt = memory_handle_map_.find(MemoryHandle::Convert(memoryOnlyHandle)); if (memoryHandleIt == memory_handle_map_.end()) { @@ -3628,7 +3631,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset, uint64_t offset = 0, ret; uint64_t drm_cpu_addr = 0; - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); auto addressHandle = VMemoryFindReservedAddressHandle(va); if (addressHandle == nullptr || reinterpret_cast(va) + size > @@ -3703,7 +3706,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset, } hsa_status_t Runtime::VMemoryHandleUnmap(void* va, size_t size) { - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); std::list> mappedHandles; // va + size may consist of multiple MappedHandle's. @@ -3921,7 +3924,7 @@ hsa_status_t Runtime::VMemorySetAccess(void* va, size_t size, if (targetAgent == NULL || !targetAgent->IsValid()) return HSA_STATUS_ERROR_INVALID_AGENT; } - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); auto addressHandle = VMemoryFindReservedAddressHandle(va); if (addressHandle == nullptr || @@ -4014,7 +4017,7 @@ hsa_status_t Runtime::VMemoryGetAccess(const void* va, hsa_access_permission_t* *perms = HSA_ACCESS_PERMISSION_NONE; bool mappedHandleFound = false; - ScopedAcquire lock(&memory_lock_); + std::lock_guard lock(memory_lock_); auto mappedHandleIt = mapped_handle_map_.upper_bound(va); if (mappedHandleIt != mapped_handle_map_.begin()) { @@ -4076,8 +4079,8 @@ hsa_status_t Runtime::VMemoryImportShareableHandle(int dmabuf_fd, return; } - for (const core::MemoryRegion* region : agent->regions()) { - const AMD::MemoryRegion* amd_region = reinterpret_cast(region); + for (const auto& region : agent->regions()) { + const AMD::MemoryRegion* amd_region = reinterpret_cast(region.get()); // TODO: Verify that this works on a system with FINE_GRAINED memory. // System's with FINE_GRAINED will have both COARSE and FINE grain... need to get the diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp index 518b5b121c..2660ddd26c 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/signal.cpp @@ -58,7 +58,7 @@ namespace rocr { namespace core { -KernelMutex Signal::ipcLock_; +std::mutex Signal::ipcLock_; std::map Signal::ipcMap_; void SharedSignalPool_t::clear() { @@ -76,7 +76,7 @@ void SharedSignalPool_t::clear() { } SharedSignal* SharedSignalPool_t::alloc() { - ScopedAcquire lock(&lock_); + std::lock_guard lock(lock_); if (free_list_.empty()) { SharedSignal* block = reinterpret_cast( allocate_()(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), core::MemoryRegion::AllocateNonPaged, 0)); @@ -109,7 +109,7 @@ void SharedSignalPool_t::free(SharedSignal* ptr) { if (ptr == nullptr) return; ptr->~SharedSignal(); - ScopedAcquire lock(&lock_); + std::lock_guard lock(lock_); ifdebug { bool valid = false; @@ -134,7 +134,7 @@ LocalSignal::LocalSignal(hsa_signal_value_t initial_value, bool exportable) } void Signal::registerIpc() { - ScopedAcquire lock(&ipcLock_); + std::lock_guard lock(ipcLock_); auto handle = Convert(this); assert(ipcMap_.find(handle.handle) == ipcMap_.end() && "Can't register the same IPC signal twice."); @@ -142,7 +142,7 @@ void Signal::registerIpc() { } bool Signal::deregisterIpc() { - ScopedAcquire lock(&ipcLock_); + std::lock_guard lock(ipcLock_); if (refcount_ != 0) return false; auto handle = Convert(this); const auto& it = ipcMap_.find(handle.handle); @@ -152,14 +152,14 @@ bool Signal::deregisterIpc() { } Signal* Signal::lookupIpc(hsa_signal_t signal) { - ScopedAcquire lock(&ipcLock_); + std::lock_guard lock(ipcLock_); const auto& it = ipcMap_.find(signal.handle); if (it == ipcMap_.end()) return nullptr; return it->second; } Signal* Signal::duplicateIpc(hsa_signal_t signal) { - ScopedAcquire lock(&ipcLock_); + std::lock_guard lock(ipcLock_); const auto& it = ipcMap_.find(signal.handle); if (it == ipcMap_.end()) return nullptr; it->second->refcount_++; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h index 3c20b88316..a36e29c2cc 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h @@ -125,16 +125,16 @@ template class lazy_ptr { private: mutable std::unique_ptr obj; mutable std::function func; - mutable KernelMutex lock; + mutable std::mutex lock; // Separated from make to improve inlining. void make_body(bool block) const { if (block) { - lock.Acquire(); - } else if (!lock.Try()) { + lock.lock(); + } else if (!lock.try_lock()) { return; } - MAKE_SCOPE_GUARD([&]() { lock.Release(); }); + MAKE_SCOPE_GUARD([&]() { lock.unlock(); }); if (func == nullptr) return; T* ptr = func(); obj.reset(ptr); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/locks.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/locks.h index e7fa2f0b5e..133dd06f0b 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/locks.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/locks.h @@ -90,6 +90,11 @@ class HybridMutex { os::PostSemaphore(sem_); } + // To add compatibility with std::lock_guard + void lock() { Acquire(); } + void unlock() { Release(); } + bool try_lock() { return Try(); } + private: std::atomic lock_; os::Semaphore sem_; @@ -100,27 +105,6 @@ class HybridMutex { DISALLOW_COPY_AND_ASSIGN(HybridMutex); }; - -/// @brief: a class represents a kernel mutex. -/// Uses the kernel's scheduler to keep the waiting thread from being scheduled -/// until the lock is released (Best for long waits, though anything using -/// a kernel object is a long wait). -class KernelMutex { - public: - KernelMutex() { lock_ = os::CreateMutex(); } - ~KernelMutex() { os::DestroyMutex(lock_); } - - bool Try() { return os::TryAcquireMutex(lock_); } - bool Acquire() { return os::AcquireMutex(lock_); } - void Release() { os::ReleaseMutex(lock_); } - - private: - os::Mutex lock_; - - /// @brief: Disable copiable and assignable ability. - DISALLOW_COPY_AND_ASSIGN(KernelMutex); -}; - /// @brief: represents a spin lock. /// For very short hold durations on the order of the thread scheduling /// quanta or less. @@ -143,6 +127,11 @@ class SpinMutex { } void Release() { lock_ = 0; } + // To add compatibility with std::lock_guard + void lock() { Acquire(); } + void unlock() { Release(); } + bool try_lock() { return Try(); } + private: std::atomic lock_; @@ -167,124 +156,6 @@ class KernelEvent { DISALLOW_COPY_AND_ASSIGN(KernelEvent); }; -/// @brief: represents a yielding shared mutex. -/// aka read/write mutex -class KernelSharedMutex { - public: - /// @brief: Interfaces ScopedAcquire to shared operations. - class Shared { - public: - explicit Shared(KernelSharedMutex* lock) : lock_(lock) {} - bool Try() { return lock_->TryShared(); } - bool Acquire() { return lock_->AcquireShared(); } - void Release() { lock_->ReleaseShared(); } - - private: - KernelSharedMutex* lock_; - }; - - KernelSharedMutex() { lock_ = os::CreateSharedMutex(); } - ~KernelSharedMutex() { os::DestroySharedMutex(lock_); } - - // Exclusive mode operations - bool Try() { return os::TryAcquireSharedMutex(lock_); } - bool Acquire() { return os::AcquireSharedMutex(lock_); } - void Release() { os::ReleaseSharedMutex(lock_); } - - // Shared mode operations - bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); } - bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); } - void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); } - - // Return shared operations interface - Shared shared() { return Shared(this); } - - private: - os::SharedMutex lock_; - - /// @brief: Disable copiable and assignable ability. - DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex); -}; - -/// @brief: Type trait to identify mutex types -template class isMutex { - public: - enum { value = false }; -}; -template <> class isMutex { - public: - enum { value = true }; -}; -template <> class isMutex { - public: - enum { value = true }; -}; -template <> class isMutex { - public: - enum { value = true }; -}; -template <> class isMutex { - public: - enum { value = true }; -}; - -/// @brief: A class behaves as a lock in a scope. When trying to enter into the -/// critical section, creat a object of this class. After the control path goes -/// out of the scope, it will release the lock automatically. -template class ScopedAcquire { - public: - /// @brief: When constructing, acquire the lock. - /// @param: lock(Input), pointer to an existing lock. - explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) { - static_assert(isMutex::value, "ScopedAcquire requires a mutex type."); - lock_.Acquire(); - } - explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) { - static_assert(!isMutex::value, "Mutex types are not copyable."); - lock_.Acquire(); - } - - /// @brief: when destructing, release the lock. - ~ScopedAcquire() { - if (doRelease) lock_.Release(); - } - - /// @brief: Release the lock early. Avoid using when possible. - void Release() { - lock_.Release(); - doRelease = false; - } - - private: - /// @brief: Adapts between pointers to mutex types and mutex pointer types. - template class container { - public: - container(T* lock) : lock_(lock) {} - __forceinline bool Acquire() { return lock_->Acquire(); } - __forceinline void Release() { return lock_->Release(); } - - private: - T* lock_; - }; - - /// @brief: Specialization for mutex pointer types. - template class container { - public: - container(T lock) : lock_(lock) {} - __forceinline bool Acquire() { return lock_.Acquire(); } - __forceinline void Release() { return lock_.Release(); } - - private: - T lock_; - }; - - container::value> lock_; - bool doRelease; - - /// @brief: Disable copiable and assignable ability. - DISALLOW_COPY_AND_ASSIGN(ScopedAcquire); -}; - } // namespace rocr #endif // HSA_RUNTIME_CORE_SUTIL_LOCKS_H_ diff --git a/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp b/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp index 7ce27689ba..500247b537 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp @@ -286,11 +286,6 @@ namespace code { } } - AmdHsaCode::~AmdHsaCode() - { - for (Symbol* sym : symbols) { delete sym; } - } - bool AmdHsaCode::PullElf() { uint32_t majorVersion, minorVersion; @@ -330,7 +325,7 @@ namespace code { } for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) { amd::elf::Symbol* elfsym = img->symtab()->symbol(i); - Symbol* sym = 0; + std::shared_ptr sym; switch (elfsym->type()) { case STT_AMDGPU_HSA_KERNEL: { amd::elf::Section* sec = elfsym->section(); @@ -347,12 +342,12 @@ namespace code { out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl; return false; } - sym = new KernelSymbol(elfsym, &akc); + sym = std::make_shared(elfsym, &akc); break; } case STT_OBJECT: case STT_COMMON: - sym = new VariableSymbol(elfsym); + sym = std::make_shared(elfsym); break; default: break; // Skip unknown symbols. @@ -924,9 +919,9 @@ namespace code { std::string(module_name ? module_name : ""), std::string(symbol_name) ); - for (Symbol* sym : symbols) { + for (const auto& sym : symbols) { if (sym->Name() == mname) { - *s = Symbol::ToHandle(sym); + *s = Symbol::ToHandle(sym.get()); return HSA_STATUS_SUCCESS; } } @@ -940,8 +935,8 @@ namespace code { void* data), void* data) { - for (Symbol* sym : symbols) { - hsa_code_symbol_t s = Symbol::ToHandle(sym); + for (const auto& sym : symbols) { + hsa_code_symbol_t s = Symbol::ToHandle(sym.get()); hsa_status_t status = callback(code_object, s, data); if (status != HSA_STATUS_SUCCESS) { return status; } } @@ -1144,8 +1139,8 @@ namespace code { { if (nullptr == img) { return nullptr; } if (!section) { section = HsaText(); } - symbols.push_back(new KernelSymbol(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr)); - return symbols.back(); + symbols.push_back(std::make_shared(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr)); + return symbols.back().get(); } Symbol* AmdHsaCode::AddVariableSymbol(const std::string &name, @@ -1157,8 +1152,8 @@ namespace code { uint64_t size) { if (nullptr == img) { return nullptr; } - symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(section, name, value, size, type, binding, other))); - return symbols.back(); + symbols.push_back(std::make_shared(img->symtab()->addSymbol(section, name, value, size, type, binding, other))); + return symbols.back().get(); } void AmdHsaCode::AddSectionSymbols() @@ -1166,16 +1161,16 @@ namespace code { if (nullptr == img) { return; } for (size_t i = 0; i < dataSections.size(); ++i) { if (dataSections[i] && dataSections[i]->flags() & SHF_ALLOC) { - symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL))); + symbols.push_back(std::make_shared(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL))); } } } Symbol* AmdHsaCode::GetSymbolByElfIndex(size_t index) { - for (auto &s : symbols) { + for (const auto &s : symbols) { if (s && index == s->Index()) { - return s; + return s.get(); } } return nullptr; @@ -1185,7 +1180,7 @@ namespace code { { for (auto &s : symbols) { if (s && n == s->Name()) { - return s; + return s.get(); } } return nullptr; @@ -1747,14 +1742,13 @@ namespace code { return false; } - AmdHsaCode* AmdHsaCodeManager::FromHandle(hsa_code_object_t c) + const std::shared_ptr& AmdHsaCodeManager::FromHandle(hsa_code_object_t c) { CodeMap::iterator i = codeMap.find(c.handle); if (i == codeMap.end()) { - AmdHsaCode* code = new AmdHsaCode(); + std::shared_ptr code = std::make_shared(); const void* buffer = reinterpret_cast(c.handle); if (!code->InitAsBuffer(buffer, 0)) { - delete code; return 0; } codeMap[c.handle] = code; @@ -1770,7 +1764,7 @@ namespace code { // Currently, we do not always create map entry for every code object buffer. return true; } - delete i->second; + i->second.reset(); codeMap.erase(i); return true; } @@ -1798,7 +1792,7 @@ namespace code { } for (size_t i = 0; i < img->getSymbolTable()->symbolCount(); ++i) { amd::elf::Symbol* elfsym = img->getSymbolTable()->symbol(i); - Symbol* sym = 0; + std::shared_ptr sym; switch (elfsym->type()) { case STT_AMDGPU_HSA_KERNEL: { amd::elf::Section* sec = elfsym->section(); @@ -1815,12 +1809,12 @@ namespace code { out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl; return false; } - sym = new KernelSymbolV2(elfsym, &akc); + sym = std::make_shared(elfsym, &akc); break; } case STT_OBJECT: case STT_COMMON: - sym = new VariableSymbolV2(elfsym); + sym = std::make_shared(elfsym); break; default: break; // Skip unknown symbols. diff --git a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp index 51843f6128..772cd36722 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp @@ -186,7 +186,6 @@ void Loader::Destroy(Loader *loader) _amdgpu_r_debug.r_map = nullptr; _amdgpu_r_debug.r_state = r_debug::RT_CONSISTENT; r_debug_tail() = nullptr; - delete loader; } Executable* AmdHsaCodeLoader::CreateExecutable( @@ -194,8 +193,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable( { WriterLockGuard writer_lock(rw_lock_); - executables.push_back(new ExecutableImpl(profile, context, executables.size(), default_float_rounding_mode)); - return executables.back(); + executables.push_back(std::make_shared(profile, context, executables.size(), default_float_rounding_mode)); + return executables.back().get(); } Executable* AmdHsaCodeLoader::CreateExecutable( @@ -206,8 +205,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable( { WriterLockGuard writer_lock(rw_lock_); - executables.push_back(new ExecutableImpl(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode)); - return executables.back(); + executables.push_back(std::make_shared(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode)); + return executables.back().get(); } static void AddCodeObjectInfoIntoDebugMap(link_map* map) { @@ -254,7 +253,7 @@ hsa_status_t AmdHsaCodeLoader::FreezeExecutable(Executable *executable, const ch atomic::Fence(std::memory_order_acq_rel); _loader_debug_state(); atomic::Fence(std::memory_order_acq_rel); - for (auto &lco : reinterpret_cast(executable)->loaded_code_objects) { + for (const auto &lco : reinterpret_cast(executable)->loaded_code_objects) { AddCodeObjectInfoIntoDebugMap(&(lco->r_debug_info)); } atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release); @@ -270,14 +269,13 @@ void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) { atomic::Fence(std::memory_order_acq_rel); _loader_debug_state(); atomic::Fence(std::memory_order_acq_rel); - for (auto &lco : reinterpret_cast(executable)->loaded_code_objects) { + for (const auto &lco : reinterpret_cast(executable)->loaded_code_objects) { RemoveCodeObjectInfoFromDebugMap(&(lco->r_debug_info)); } atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release); _loader_debug_state(); - executables[((ExecutableImpl*)executable)->id()] = nullptr; - delete executable; + executables[static_cast(executable)->id()].reset(); } hsa_status_t AmdHsaCodeLoader::IterateExecutables( @@ -289,9 +287,9 @@ hsa_status_t AmdHsaCodeLoader::IterateExecutables( WriterLockGuard writer_lock(rw_lock_); assert(callback); - for (auto &exec : executables) { + for (const auto &exec : executables) { if(exec != nullptr){ - hsa_status_t status = callback(Executable::Handle(exec), data); + hsa_status_t status = callback(Executable::Handle(exec.get()), data); if (status != HSA_STATUS_SUCCESS) { return status; } @@ -318,7 +316,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors( this->EnableReadOnlyMode(); size_t actual_num_segment_descriptors = 0; - for (auto &executable : executables) { + for (const auto &executable : executables) { if (executable) { actual_num_segment_descriptors += executable->GetNumSegmentDescriptors(); } @@ -335,7 +333,7 @@ hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors( } size_t i = 0; - for (auto &executable : executables) { + for (const auto &executable : executables) { if (executable) { i += executable->QuerySegmentDescriptors(segment_descriptors, actual_num_segment_descriptors, i); } @@ -352,7 +350,7 @@ uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address) return 0; } - for (auto &exec : executables) { + for (const auto &exec : executables) { if (exec != nullptr) { uint64_t host_address = exec->FindHostAddress(device_address); if (host_address != 0) { @@ -371,9 +369,9 @@ void AmdHsaCodeLoader::PrintHelp(std::ostream& out) void AmdHsaCodeLoader::EnableReadOnlyMode() { rw_lock_.ReaderLock(); - for (auto &executable : executables) { + for (const auto &executable : executables) { if (executable) { - ((ExecutableImpl*)executable)->EnableReadOnlyMode(); + ((ExecutableImpl*)executable.get())->EnableReadOnlyMode(); } } } @@ -381,9 +379,9 @@ void AmdHsaCodeLoader::EnableReadOnlyMode() void AmdHsaCodeLoader::DisableReadOnlyMode() { rw_lock_.ReaderUnlock(); - for (auto &executable : executables) { + for (const auto &executable : executables) { if (executable) { - ((ExecutableImpl*)executable)->DisableReadOnlyMode(); + ((ExecutableImpl*)executable.get())->DisableReadOnlyMode(); } } } @@ -781,18 +779,10 @@ ExecutableImpl::ExecutableImpl( } ExecutableImpl::~ExecutableImpl() { - for (ExecutableObject* o : objects) { + for (const auto& o : objects) { o->Destroy(); - delete o; } objects.clear(); - - for (auto &symbol_entry : program_symbols_) { - delete symbol_entry.second; - } - for (auto &symbol_entry : agent_symbols_) { - delete symbol_entry.second; - } } hsa_status_t ExecutableImpl::DefineProgramExternalVariable( @@ -812,7 +802,7 @@ hsa_status_t ExecutableImpl::DefineProgramExternalVariable( program_symbols_.insert( std::make_pair(std::string(name), - new VariableSymbol(true, + std::make_shared(true, "", // Only program linkage symbols can be // defined. std::string(name), @@ -848,7 +838,7 @@ hsa_status_t ExecutableImpl::DefineAgentExternalVariable( auto insert_status = agent_symbols_.insert( std::make_pair(std::make_pair(std::string(name), agent), - new VariableSymbol(true, + std::make_shared(true, "", // Only program linkage symbols can be // defined. std::string(name), @@ -896,14 +886,14 @@ Symbol* ExecutableImpl::GetSymbolInternal( if (!agent) { auto program_symbol = program_symbols_.find(mangled_name); if (program_symbol != program_symbols_.end()) { - return program_symbol->second; + return program_symbol->second.get(); } return nullptr; } auto agent_symbol = agent_symbols_.find(std::make_pair(mangled_name, *agent)); if (agent_symbol != agent_symbols_.end()) { - return agent_symbol->second; + return agent_symbol->second.get(); } return nullptr; } @@ -916,14 +906,14 @@ hsa_status_t ExecutableImpl::IterateSymbols( for (auto &symbol_entry : program_symbols_) { hsa_status_t hsc = - callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data); + callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data); if (HSA_STATUS_SUCCESS != hsc) { return hsc; } } for (auto &symbol_entry : agent_symbols_) { hsa_status_t hsc = - callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data); + callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data); if (HSA_STATUS_SUCCESS != hsc) { return hsc; } @@ -948,7 +938,7 @@ hsa_status_t ExecutableImpl::IterateAgentSymbols( } hsa_status_t status = callback( - Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second), + Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second.get()), data); if (status != HSA_STATUS_SUCCESS) { return status; @@ -968,7 +958,7 @@ hsa_status_t ExecutableImpl::IterateProgramSymbols( for (auto &symbol_entry : program_symbols_) { hsa_status_t status = callback( - Executable::Handle(this), Symbol::Handle(symbol_entry.second), data); + Executable::Handle(this), Symbol::Handle(symbol_entry.second.get()), data); if (status != HSA_STATUS_SUCCESS) { return status; } @@ -987,10 +977,10 @@ hsa_status_t ExecutableImpl::IterateLoadedCodeObjects( ReaderLockGuard reader_lock(rw_lock_); assert(callback); - for (auto &loaded_code_object : loaded_code_objects) { + for (const auto& loaded_code_object : loaded_code_objects) { hsa_status_t status = callback( Executable::Handle(this), - LoadedCodeObject::Handle(loaded_code_object), + LoadedCodeObject::Handle(loaded_code_object.get()), data); if (status != HSA_STATUS_SUCCESS) { return status; @@ -1004,7 +994,7 @@ size_t ExecutableImpl::GetNumSegmentDescriptors() { // assuming we are in readonly mode. size_t actual_num_segment_descriptors = 0; - for (auto &obj : loaded_code_objects) { + for (const auto &obj : loaded_code_objects) { actual_num_segment_descriptors += obj->LoadedSegments().size(); } return actual_num_segment_descriptors; @@ -1020,7 +1010,7 @@ size_t ExecutableImpl::QuerySegmentDescriptors( assert(first_empty_segment_descriptor < total_num_segment_descriptors); size_t i = first_empty_segment_descriptor; - for (auto &obj : loaded_code_objects) { + for (const auto &obj : loaded_code_objects) { assert(i < total_num_segment_descriptors); for (auto &seg : obj->LoadedSegments()) { segment_descriptors[i].agent = seg->Agent(); @@ -1084,11 +1074,11 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address) return execHandle; } - for (auto &exec : executables) { + for (const auto &exec : executables) { if (exec != nullptr) { uint64_t host_address = exec->FindHostAddress(device_address); if (host_address != 0) { - return Executable::Handle(exec); + return Executable::Handle(exec.get()); } } } @@ -1098,7 +1088,7 @@ hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address) uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address) { ReaderLockGuard reader_lock(rw_lock_); - for (auto &obj : loaded_code_objects) { + for (const auto &obj : loaded_code_objects) { assert(obj); for (auto &seg : obj->LoadedSegments()) { assert(seg); @@ -1224,7 +1214,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject( uint32_t codeNum = NextCodeObjectNum(); - code.reset(new code::AmdHsaCode()); + code = std::make_unique(); std::string substituteFileName; for (const Substitute& ss : substitutes) { @@ -1306,8 +1296,8 @@ hsa_status_t ExecutableImpl::LoadCodeObject( hsa_status_t status; - objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), code->ElfSize())); - loaded_code_objects.push_back((LoadedCodeObjectImpl*)objects.back()); + objects.push_back(std::make_shared(this, agent, code->ElfData(), code->ElfSize())); + loaded_code_objects.push_back(std::static_pointer_cast(objects.back())); status = LoadSegments(agent, code.get(), majorVersion); if (status != HSA_STATUS_SUCCESS) return status; @@ -1338,7 +1328,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject( loaded_code_objects.back()->r_debug_info.l_prev = nullptr; loaded_code_objects.back()->r_debug_info.l_next = nullptr; - if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back()); } + if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back().get()); } return HSA_STATUS_SUCCESS; } @@ -1376,18 +1366,18 @@ hsa_status_t ExecutableImpl::LoadSegmentsV2(hsa_agent_t agent, AMD_ISA_ALIGN_BYTES, true); if (!ptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - Segment *load_segment = new Segment(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT, + std::shared_ptr load_segment = std::make_shared(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT, ptr, size, vaddr, c->DataSegment(0)->offset()); if (!load_segment) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; hsa_status_t status = HSA_STATUS_SUCCESS; for (size_t i = 0; i < c->DataSegmentCount(); ++i) { - status = LoadSegmentV2(c->DataSegment(i), load_segment); + status = LoadSegmentV2(c->DataSegment(i), load_segment.get()); if (status != HSA_STATUS_SUCCESS) return status; } objects.push_back(load_segment); - loaded_code_objects.back()->LoadedSegments().push_back(load_segment); + loaded_code_objects.back()->LoadedSegments().push_back(load_segment.get()); return HSA_STATUS_SUCCESS; } @@ -1398,7 +1388,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent, if (s->memSize() == 0) return HSA_STATUS_SUCCESS; amdgpu_hsa_elf_segment_t segment = (amdgpu_hsa_elf_segment_t)(s->type() - PT_LOOS); - Segment *new_seg = nullptr; + std::shared_ptr new_seg; bool need_alloc = true; if (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM && nullptr != program_allocation_segment) { new_seg = program_allocation_segment; @@ -1407,7 +1397,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent, if (need_alloc) { void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true); if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset()); + new_seg = std::make_shared(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset()); new_seg->Copy(s->vaddr(), s->data(), s->imageSize()); objects.push_back(new_seg); @@ -1416,7 +1406,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent, } } assert(new_seg); - loaded_code_objects.back()->LoadedSegments().push_back(new_seg); + loaded_code_objects.back()->LoadedSegments().push_back(new_seg.get()); return HSA_STATUS_SUCCESS; } @@ -1471,7 +1461,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, } uint64_t address = SymbolAddress(agent, sym); - SymbolImpl *symbol = nullptr; + std::shared_ptr symbol; if (string_ends_with(sym->GetSymbolName(), ".kd")) { // V3. llvm::amdhsa::kernel_descriptor_t kd; @@ -1486,7 +1476,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, uint64_t size = sym->Size(); - KernelSymbol *kernel_symbol = new KernelSymbol(true, + std::shared_ptr kernel_symbol = std::make_shared(true, sym->GetModuleName(), sym->GetSymbolName(), sym->Linkage(), @@ -1502,7 +1492,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, address); symbol = kernel_symbol; } else if (sym->IsVariableSymbol()) { - symbol = new VariableSymbol(true, + symbol = std::make_shared(true, sym->GetModuleName(), sym->GetSymbolName(), sym->Linkage(), @@ -1537,7 +1527,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, // calculate end of segment - symbol value. size = sym->GetSection()->size() - sym->SectionOffset(); } - KernelSymbol *kernel_symbol = new KernelSymbol(true, + std::shared_ptr kernel_symbol = std::make_shared(true, sym->GetModuleName(), sym->GetSymbolName(), sym->Linkage(), @@ -1970,7 +1960,7 @@ void ExecutableImpl::Print(std::ostream& out) << std::endl << std::endl; out << "Loaded Objects (total " << objects.size() << ")" << std::endl; size_t i = 0; - for (ExecutableObject* o : objects) { + for (const auto& o : objects) { out << "Loaded Object " << i++ << ": "; o->Print(out); out << std::endl; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp index 2cd9bdb9d7..9d8a238fb1 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp @@ -461,7 +461,7 @@ public: }; typedef std::string ProgramSymbol; -typedef std::unordered_map ProgramSymbolMap; +typedef std::unordered_map> ProgramSymbolMap; typedef std::pair AgentSymbol; struct ASC { @@ -476,7 +476,7 @@ struct ASH { return h ^ (i << 1); } }; -typedef std::unordered_map AgentSymbolMap; +typedef std::unordered_map, ASH, ASC> AgentSymbolMap; class ExecutableImpl final: public Executable { friend class AmdHsaCodeLoader; @@ -634,15 +634,15 @@ private: ProgramSymbolMap program_symbols_; AgentSymbolMap agent_symbols_; - std::vector objects; - Segment *program_allocation_segment; - std::vector loaded_code_objects; + std::vector> objects; + std::shared_ptr program_allocation_segment; + std::vector> loaded_code_objects; }; class AmdHsaCodeLoader : public Loader { private: Context* context; - std::vector executables; + std::vector> executables; amd::hsa::common::ReaderWriterLock rw_lock_; public: diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp index 8931aa9f92..7eb0621d65 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp @@ -282,7 +282,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal( size_t interval, size_t latency, size_t buffer_size, hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, void* client_cb_data, hsa_ven_amd_pcs_t* handle, agent_pcs_create_fn_t agent_pcs_create_fn) { - ScopedAcquire lock(&pc_sampling_lock_); + std::lock_guard lock(pc_sampling_lock_); handle->handle = ++pc_sampling_id_; // create a new PcSamplingSession(agent, method, units, interval, latency, buffer_size, @@ -305,7 +305,7 @@ hsa_status_t PcsRuntime::PcSamplingCreateInternal( } hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) { - ScopedAcquire lock(&pc_sampling_lock_); + std::lock_guard lock(pc_sampling_lock_); auto pcSamplingSessionIt = pc_sampling_.find(static_cast(handle.handle)); if (pcSamplingSessionIt == pc_sampling_.end()) { debug_warning(false && "Cannot find PcSampling session"); @@ -319,7 +319,7 @@ hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) { } hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) { - ScopedAcquire lock(&pc_sampling_lock_); + std::lock_guard lock(pc_sampling_lock_); auto pcSamplingSessionIt = pc_sampling_.find(static_cast(handle.handle)); if (pcSamplingSessionIt == pc_sampling_.end()) { debug_warning(false && "Cannot find PcSampling session"); @@ -331,7 +331,7 @@ hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) { } hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) { - ScopedAcquire lock(&pc_sampling_lock_); + std::lock_guard lock(pc_sampling_lock_); auto pcSamplingSessionIt = pc_sampling_.find(static_cast(handle.handle)); if (pcSamplingSessionIt == pc_sampling_.end()) { debug_warning(false && "Cannot find PcSampling session"); @@ -343,7 +343,7 @@ hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) { } hsa_status_t PcsRuntime::PcSamplingFlush(hsa_ven_amd_pcs_t handle) { - ScopedAcquire lock(&pc_sampling_lock_); + std::lock_guard lock(pc_sampling_lock_); auto pcSamplingSessionIt = pc_sampling_.find(static_cast(handle.handle)); if (pcSamplingSessionIt == pc_sampling_.end()) { debug_warning(false && "Cannot find PcSampling session"); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h index 3547693e8d..72a5ad4480 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h @@ -166,7 +166,7 @@ class PcsRuntime { } // Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle std::map pc_sampling_; - KernelMutex pc_sampling_lock_; + std::mutex pc_sampling_lock_; uint64_t pc_sampling_id_; DISALLOW_COPY_AND_ASSIGN(PcsRuntime);