diff --git a/runtime/hsa-runtime/core/common/shared.cpp b/runtime/hsa-runtime/core/common/shared.cpp index 19ae5b2632..6974c27694 100644 --- a/runtime/hsa-runtime/core/common/shared.cpp +++ b/runtime/hsa-runtime/core/common/shared.cpp @@ -44,7 +44,6 @@ namespace rocr { namespace core { -std::function BaseShared::allocate_ = nullptr; -std::function BaseShared::free_ = nullptr; + } // namespace core } // namespace rocr diff --git a/runtime/hsa-runtime/core/common/shared.h b/runtime/hsa-runtime/core/common/shared.h index 592295c529..b165093409 100644 --- a/runtime/hsa-runtime/core/common/shared.h +++ b/runtime/hsa-runtime/core/common/shared.h @@ -58,25 +58,36 @@ namespace core { class BaseShared { public: static void SetAllocateAndFree( - const std::function& allocate, - const std::function& free) { - allocate_ = allocate; - free_ = free; + const std::function& alloc, + const std::function& fr) { + allocate_() = alloc; + free_() = fr; } protected: - static std::function allocate_; - static std::function free_; + static __forceinline std::function& + allocate_() { + static std::function alloc = + nullptr; + return alloc; + } + static __forceinline std::function& + free_() { + static std::function fr = nullptr; + return fr; + } + }; /// @brief Default Allocator for Shared. Ensures allocations are whole pages. template class PageAllocator : private BaseShared { public: __forceinline static T* alloc(int flags = 0) { - T* ret = reinterpret_cast(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, 0)); + T* ret = reinterpret_cast( + allocate_()(AlignUp(sizeof(T), 4096), 4096, flags, 0)); if (ret == nullptr) throw std::bad_alloc(); - MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); }); + MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_()(ret); }); new (ret) T; @@ -85,10 +96,11 @@ template class PageAllocator : private BaseShared { } __forceinline static T* alloc(int agent_node_id, int flags) { - T* ret = reinterpret_cast(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id)); + T* ret = reinterpret_cast( + allocate_()(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id)); if (ret == nullptr) throw std::bad_alloc(); - MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); }); + MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_()(ret); }); new (ret) T; @@ -99,7 +111,7 @@ template class PageAllocator : private BaseShared { __forceinline static void free(T* ptr) { if (ptr != nullptr) { ptr->~T(); - free_(ptr); + free_()(ptr); } } }; @@ -110,7 +122,7 @@ template > class Shared final : private BaseShared { public: explicit Shared(Allocator* pool = nullptr, int flags = 0) : pool_(pool) { - assert(allocate_ != nullptr && free_ != nullptr && + assert(allocate_() != nullptr && free_() != nullptr && "Shared object allocator is not set"); if (pool_) @@ -120,7 +132,7 @@ class Shared final : private BaseShared { } explicit Shared(int agent_node_id, Allocator* pool = nullptr, int flags = 0) : pool_(pool) { - assert(allocate_ != nullptr && free_ != nullptr && + assert(allocate_() != nullptr && free_() != nullptr && "Shared object allocator is not set"); if (pool_) @@ -130,7 +142,8 @@ class Shared final : private BaseShared { } ~Shared() { - assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set"); + assert(allocate_() != nullptr && free_() != nullptr && + "Shared object allocator is not set"); if (pool_) pool_->free(shared_object_); @@ -164,19 +177,20 @@ class Shared final : private BaseShared { template class Shared> final : private BaseShared { public: Shared(int flags = 0) { - assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set"); + assert(allocate_() != nullptr && free_() != nullptr && + "Shared object allocator is not set"); shared_object_ = PageAllocator::alloc(flags); } Shared(int agent_node_id, int flags) { - assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set"); + assert(allocate_() != nullptr && free_() != nullptr && "Shared object allocator is not set"); shared_object_ = PageAllocator::alloc(agent_node_id, flags); } ~Shared() { - assert(allocate_ != nullptr && free_ != nullptr && + assert(allocate_() != nullptr && free_() != nullptr && "Shared object allocator is not set"); PageAllocator::free(shared_object_); @@ -207,18 +221,19 @@ template class SharedArray final : private BaseShared SharedArray() : shared_object_(nullptr) {} explicit SharedArray(size_t length) : shared_object_(nullptr), len(length) { - assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set"); + assert(allocate_() != nullptr && free_() != nullptr && + "Shared object allocator is not set"); static_assert((__alignof(T) <= Align) || (Align == 0), "Align is less than alignof(T)"); shared_object_ = - reinterpret_cast(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0, 0)); + reinterpret_cast(allocate_()(sizeof(T) * length, Max(__alignof(T), Align), 0, 0)); if (shared_object_ == nullptr) throw std::bad_alloc(); size_t i = 0; MAKE_NAMED_SCOPE_GUARD(loopGuard, [&]() { for (size_t t = 0; t < i - 1; t++) shared_object_[t].~T(); - free_(shared_object_); + free_()(shared_object_); }); for (; i < length; i++) new (&shared_object_[i]) T; @@ -227,11 +242,12 @@ template class SharedArray final : private BaseShared } ~SharedArray() { - assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set"); + assert(allocate_() != nullptr && free_() != nullptr && + "Shared object allocator is not set"); if (shared_object_ != nullptr) { for (size_t i = 0; i < len; i++) shared_object_[i].~T(); - free_(shared_object_); + free_()(shared_object_); } } diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index da0bfb43ce..2c449c18d2 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -62,11 +62,11 @@ class AieAqlQueue : public core::Queue, core::DoorbellSignal { public: static __forceinline bool IsType(core::Signal *signal) { - return signal->IsType(&rtti_id_); + return signal->IsType(&rtti_id()); } static __forceinline bool IsType(core::Queue *queue) { - return queue->IsType(&rtti_id_); + return queue->IsType(&rtti_id()); } AieAqlQueue() = delete; @@ -123,7 +123,7 @@ public: uint32_t queue_size_bytes_ = std::numeric_limits::max(); protected: - bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id_; } + bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id(); } private: AieAgent &agent_; @@ -144,7 +144,11 @@ private: /// @brief Indicates if queue is active. std::atomic active_; - static int rtti_id_; + static __forceinline int& rtti_id() { + static int rtti_id_ = 0; + return rtti_id_; + } + }; } // namespace AMD diff --git a/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aql_queue.h index 09f14f9410..c446c0d872 100644 --- a/runtime/hsa-runtime/core/inc/amd_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aql_queue.h @@ -57,10 +57,10 @@ namespace AMD { class AqlQueue : public core::Queue, private core::LocalSignal, public core::DoorbellSignal { public: static __forceinline bool IsType(core::Signal* signal) { - return signal->IsType(&rtti_id_); + return signal->IsType(&rtti_id()); } - static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id_); } + static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id()); } // Acquires/releases queue resources and requests HW schedule/deschedule. AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, @@ -228,7 +228,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo void AsyncReclaimAltScratch(); protected: - bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id_; } + bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id(); } private: uint32_t ComputeRingBufferMinPkts(); @@ -331,18 +331,28 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo std::vector cu_mask_; // Shared event used for queue errors - static HsaEvent* queue_event_; - + static __forceinline HsaEvent*& queue_event() { + static HsaEvent* queue_event_ = nullptr; + return queue_event_; + } // Queue count - used to ref count queue_event_ - static std::atomic queue_count_; + static __forceinline std::atomic& queue_count() { + static std::atomic queue_count_(0); + return queue_count_; + } // Mutex for queue_event_ manipulation - static KernelMutex queue_lock_; - + static __forceinline KernelMutex& queue_lock() { + static KernelMutex queue_lock_; + return queue_lock_; +} // Async scratch single limit - may be modified after init size_t async_scratch_single_limit_; - static int rtti_id_; + static __forceinline int& rtti_id() { + static int rtti_id_ = 0; + return rtti_id_; + } // Forbid copying and moving of this object DISALLOW_COPY_AND_ASSIGN(AqlQueue); diff --git a/runtime/hsa-runtime/core/inc/amd_blit_kernel.h b/runtime/hsa-runtime/core/inc/amd_blit_kernel.h index 8c45a1b2a2..077b9782b1 100644 --- a/runtime/hsa-runtime/core/inc/amd_blit_kernel.h +++ b/runtime/hsa-runtime/core/inc/amd_blit_kernel.h @@ -117,6 +117,7 @@ class BlitKernel : public core::Blit { virtual void GangLeader(bool gang_leader) override {} virtual bool GangLeader() const override { return false; } + const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID; private: union KernelArgs { struct __ALIGNED__(16) { diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h index bb6b76daef..1ac0cb7855 100644 --- a/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -167,7 +167,7 @@ class MemoryRegion : public core::MemoryRegion { return static_cast(mem_props_.MemoryClockMax); } - __forceinline size_t GetPageSize() const { return kPageSize_; } + __forceinline size_t GetPageSize() const { return kPageSize(); } __forceinline const HsaMemFlags &mem_flags() const { return mem_flag_; } __forceinline const HsaMemMapFlags &map_flags() const { return map_flag_; } @@ -195,7 +195,10 @@ private: // fragments of the block routing to the same MemoryRegion. mutable KernelMutex access_lock_; - static size_t kPageSize_; + static __forceinline const size_t& kPageSize() { + static size_t kPageSize_ = sysconf(_SC_PAGESIZE); + return kPageSize_; + } // Determine access type allowed to requesting device hsa_amd_memory_pool_access_t GetAccessInfo(const core::Agent& agent, diff --git a/runtime/hsa-runtime/core/inc/default_signal.h b/runtime/hsa-runtime/core/inc/default_signal.h index 0dd1ba26b6..7535fd7d12 100644 --- a/runtime/hsa-runtime/core/inc/default_signal.h +++ b/runtime/hsa-runtime/core/inc/default_signal.h @@ -59,7 +59,7 @@ class BusyWaitSignal : public Signal { /// @brief Determines if a Signal* can be safely converted to BusyWaitSignal* /// via static_cast. static __forceinline bool IsType(Signal* ptr) { - return ptr->IsType(&rtti_id_); + return ptr->IsType(&rtti_id()); } /// @brief See base class Signal. @@ -154,10 +154,13 @@ class BusyWaitSignal : public Signal { __forceinline HsaEvent* EopEvent() { return NULL; } protected: - bool _IsA(rtti_t id) const { return id == &rtti_id_; } + bool _IsA(rtti_t id) const { return id == &rtti_id(); } private: - static int rtti_id_; + static __forceinline int& rtti_id() { + static int rtti_id_ = 0; + return rtti_id_; + } DISALLOW_COPY_AND_ASSIGN(BusyWaitSignal); }; @@ -167,7 +170,7 @@ class DefaultSignal : private LocalSignal, public BusyWaitSignal { public: /// @brief Determines if a Signal* can be safely converted to BusyWaitSignal* /// via static_cast. - static __forceinline bool IsType(Signal* ptr) { return ptr->IsType(&rtti_id_); } + static __forceinline bool IsType(Signal* ptr) { return ptr->IsType(&rtti_id()); } /// @brief See base class Signal. explicit DefaultSignal(hsa_signal_value_t initial_value, bool enableIPC = false) @@ -175,12 +178,15 @@ class DefaultSignal : private LocalSignal, public BusyWaitSignal { protected: bool _IsA(rtti_t id) const { - if (id == &rtti_id_) return true; + if (id == &rtti_id()) return true; return BusyWaitSignal::_IsA(id); } private: - static int rtti_id_; + static __forceinline int& rtti_id() { + static int rtti_id_ = 0; + return rtti_id_; + } DISALLOW_COPY_AND_ASSIGN(DefaultSignal); }; diff --git a/runtime/hsa-runtime/core/inc/host_queue.h b/runtime/hsa-runtime/core/inc/host_queue.h index ce0bfbbcc1..bf052026ce 100644 --- a/runtime/hsa-runtime/core/inc/host_queue.h +++ b/runtime/hsa-runtime/core/inc/host_queue.h @@ -52,7 +52,7 @@ namespace rocr { namespace core { class HostQueue : public Queue { public: - static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id_); } + static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id()); } HostQueue(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type, uint32_t features, hsa_signal_t doorbell_signal); @@ -175,17 +175,23 @@ class HostQueue : public Queue { void operator delete(void*, void*) {} protected: - bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id_; } + bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id(); } private: - static int rtti_id_; + static __forceinline int& rtti_id() { + static int rtti_id_ = 0; + return rtti_id_; + } static const size_t kRingAlignment = 256; const uint32_t size_; void* ring_; // Host queue id counter, starting from 0x80000000 to avoid overlaping // with aql queue id. - static std::atomic queue_count_; + static __forceinline std::atomic& queue_count() { + static std::atomic queue_count_; + return queue_count_; + } DISALLOW_COPY_AND_ASSIGN(HostQueue); }; diff --git a/runtime/hsa-runtime/core/inc/hsa_amd_tool_int.hpp b/runtime/hsa-runtime/core/inc/hsa_amd_tool_int.hpp index d0a0248376..566c01204f 100644 --- a/runtime/hsa-runtime/core/inc/hsa_amd_tool_int.hpp +++ b/runtime/hsa-runtime/core/inc/hsa_amd_tool_int.hpp @@ -32,7 +32,7 @@ __forceinline void notify_event_scratch_async_reclaim_end(const hsa_queue_t* que __forceinline void notify_event_scratch_alloc_start(const hsa_queue_t* queue, scratch_alloc_flag flags, uint64_t dispatch_id) { - const auto& tool_table = core::hsa_api_table_.tools_api; + const auto& tool_table = core::hsa_api_table().tools_api; if (!tool_table.hsa_amd_tool_scratch_event_alloc_start_fn) { return; } @@ -49,7 +49,7 @@ __forceinline void notify_event_scratch_alloc_start(const hsa_queue_t* queue, __forceinline void notify_event_scratch_alloc_end(const hsa_queue_t* queue, scratch_alloc_flag flags, uint64_t dispatch_id, size_t size, size_t num_slots) { - const auto& tool_table = core::hsa_api_table_.tools_api; + const auto& tool_table = core::hsa_api_table().tools_api; if (!tool_table.hsa_amd_tool_scratch_event_alloc_end_fn) { return; } @@ -69,7 +69,7 @@ __forceinline void notify_event_scratch_alloc_end(const hsa_queue_t* queue, __forceinline void notify_event_scratch_free_start(const hsa_queue_t* queue, scratch_alloc_flag flags) { - const auto& tool_table = core::hsa_api_table_.tools_api; + const auto& tool_table = core::hsa_api_table().tools_api; if (!tool_table.hsa_amd_tool_scratch_event_free_start_fn) { return; } @@ -86,7 +86,7 @@ __forceinline void notify_event_scratch_free_start(const hsa_queue_t* queue, __forceinline void notify_event_scratch_free_end(const hsa_queue_t* queue, scratch_alloc_flag flags) { - const auto& tool_table = core::hsa_api_table_.tools_api; + const auto& tool_table = core::hsa_api_table().tools_api; if (!tool_table.hsa_amd_tool_scratch_event_free_end_fn) { return; } @@ -103,7 +103,7 @@ __forceinline void notify_event_scratch_free_end(const hsa_queue_t* queue, __forceinline void notify_event_scratch_async_reclaim_start(const hsa_queue_t* queue, scratch_alloc_flag flags) { - const auto& tool_table = core::hsa_api_table_.tools_api; + const auto& tool_table = core::hsa_api_table().tools_api; if (!tool_table.hsa_amd_tool_scratch_event_async_reclaim_start_fn) { return; } @@ -120,7 +120,7 @@ __forceinline void notify_event_scratch_async_reclaim_start(const hsa_queue_t* q __forceinline void notify_event_scratch_async_reclaim_end(const hsa_queue_t* queue, scratch_alloc_flag flags) { - const auto& tool_table = core::hsa_api_table_.tools_api; + const auto& tool_table = core::hsa_api_table().tools_api; if (!tool_table.hsa_amd_tool_scratch_event_async_reclaim_end_fn) { return; } diff --git a/runtime/hsa-runtime/core/inc/hsa_api_trace_int.h b/runtime/hsa-runtime/core/inc/hsa_api_trace_int.h index e61e8a5dbd..08cb555394 100644 --- a/runtime/hsa-runtime/core/inc/hsa_api_trace_int.h +++ b/runtime/hsa-runtime/core/inc/hsa_api_trace_int.h @@ -73,8 +73,8 @@ namespace core { void Reset(); }; - extern HsaApiTable hsa_api_table_; - extern HsaApiTable hsa_internal_api_table_; + extern HsaApiTable& hsa_api_table(); + extern HsaApiTable& hsa_internal_api_table(); void LoadInitialHsaApiTable(); } // namespace core diff --git a/runtime/hsa-runtime/core/inc/hsa_table_interface.h b/runtime/hsa-runtime/core/inc/hsa_table_interface.h index 5cecbd4c02..e2885ec990 100644 --- a/runtime/hsa-runtime/core/inc/hsa_table_interface.h +++ b/runtime/hsa-runtime/core/inc/hsa_table_interface.h @@ -40,8 +40,13 @@ // //////////////////////////////////////////////////////////////////////////////// +#ifndef RUNTIME_HSA_RUNTIME_CORE_INC_HSA_TABLE_INTERFACE_H_ +#define RUNTIME_HSA_RUNTIME_CORE_INC_HSA_TABLE_INTERFACE_H_ + #include "inc/hsa_api_trace.h" void hsa_table_interface_init(const HsaApiTable* apiTable); const HsaApiTable* hsa_table_interface_get_table(); + +#endif // RUNTIME_HSA_RUNTIME_CORE_INC_HSA_TABLE_INTERFACE_H_ \ No newline at end of file diff --git a/runtime/hsa-runtime/core/inc/intercept_queue.h b/runtime/hsa-runtime/core/inc/intercept_queue.h index 8088d5e922..80915a9a8a 100644 --- a/runtime/hsa-runtime/core/inc/intercept_queue.h +++ b/runtime/hsa-runtime/core/inc/intercept_queue.h @@ -272,14 +272,17 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi /// @brief Provide information about the queue hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void* value) override; - static __forceinline bool IsType(core::Signal* signal) { return signal->IsType(&rtti_id_); } - static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id_); } + static __forceinline bool IsType(core::Signal* signal) { return signal->IsType(&rtti_id()); } + static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id()); } protected: - bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id_; } + bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id(); } private: - static int rtti_id_; + static __forceinline int& rtti_id() { + static int rtti_id_ = 0; + return rtti_id_; + } }; } // namespace core diff --git a/runtime/hsa-runtime/core/inc/interrupt_signal.h b/runtime/hsa-runtime/core/inc/interrupt_signal.h index 1652c1b751..a774b095d9 100644 --- a/runtime/hsa-runtime/core/inc/interrupt_signal.h +++ b/runtime/hsa-runtime/core/inc/interrupt_signal.h @@ -94,7 +94,7 @@ class InterruptSignal : private LocalSignal, public Signal { /// @brief Determines if a Signal* can be safely converted to an /// InterruptSignal* via static_cast. static __forceinline bool IsType(Signal* ptr) { - return ptr->IsType(&rtti_id_); + return ptr->IsType(&rtti_id()); } explicit InterruptSignal(hsa_signal_value_t initial_value, @@ -191,7 +191,7 @@ class InterruptSignal : private LocalSignal, public Signal { __forceinline HsaEvent* EopEvent() { return event_; } protected: - bool _IsA(rtti_t id) const { return id == &rtti_id_; } + bool _IsA(rtti_t id) const { return id == &rtti_id(); } private: /// @variable KFD event on which the interrupt signal is based on. @@ -202,7 +202,10 @@ class InterruptSignal : private LocalSignal, public Signal { bool free_event_; /// Used to obtain a globally unique value (address) for rtti. - static int rtti_id_; + static __forceinline int& rtti_id() { + static int rtti_id_ = 0; + return rtti_id_; + } /// @brief Notify driver of signal value change if necessary. __forceinline void SetEvent() { diff --git a/runtime/hsa-runtime/core/inc/ipc_signal.h b/runtime/hsa-runtime/core/inc/ipc_signal.h index 87858ef18f..0d2e0ae445 100644 --- a/runtime/hsa-runtime/core/inc/ipc_signal.h +++ b/runtime/hsa-runtime/core/inc/ipc_signal.h @@ -90,16 +90,19 @@ class IPCSignal : private SharedMemorySignal, public BusyWaitSignal { /// @brief Determines if a Signal* can be safely converted to BusyWaitSignal* /// via static_cast. - static __forceinline bool IsType(Signal* ptr) { return ptr->IsType(&rtti_id_); } + static __forceinline bool IsType(Signal* ptr) { return ptr->IsType(&rtti_id()); } protected: bool _IsA(rtti_t id) const { - if (id == &rtti_id_) return true; + if (id == &rtti_id()) return true; return BusyWaitSignal::_IsA(id); } private: - static int rtti_id_; + static __forceinline int& rtti_id() { + static int rtti_id_ = 0; + return rtti_id_; + } static KernelMutex lock_; explicit IPCSignal(SharedMemorySignal&& abi_block) diff --git a/runtime/hsa-runtime/core/inc/isa.h b/runtime/hsa-runtime/core/inc/isa.h index ef182ee9d9..a04c86e19f 100644 --- a/runtime/hsa-runtime/core/inc/isa.h +++ b/runtime/hsa-runtime/core/inc/isa.h @@ -101,7 +101,7 @@ class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> { typedef std::tuple Version; /// @brief Default destructor. - ~Isa() {} + ~Isa() = default; /// @returns Handle equivalent of @p isa_object. static hsa_isa_t Handle(const Isa *isa_object) { @@ -228,17 +228,14 @@ class IsaRegistry final { /// @brief IsaRegistry's map type. typedef std::unordered_map IsaMap; - /// @brief Supported instruction set architectures. - static const IsaMap supported_isas_; + /// @brief Default constructor + IsaRegistry() = delete; - /// @brief Default constructor - not available. - IsaRegistry(); - - /// @brief Default destructor - not available. - ~IsaRegistry(); + /// @brief Default destructor + ~IsaRegistry() = default; /// @returns Supported instruction set architectures. - static const IsaMap GetSupportedIsas(); + static const IsaMap& GetSupportedIsas(); }; // class IsaRegistry } // namespace core diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index 137ec255bd..d8541de496 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -583,8 +583,10 @@ class Runtime { // Will be created before any user could call hsa_init but also could be // destroyed before incorrectly written programs call hsa_shutdown. - static KernelMutex bootstrap_lock_; - + static __forceinline KernelMutex& bootstrap_lock() { + static KernelMutex bootstrap_lock_; + return bootstrap_lock_; + } Runtime(); Runtime(const Runtime&); diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 9459909d39..ac1827055f 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -66,8 +66,6 @@ namespace rocr { namespace AMD { -int AieAqlQueue::rtti_id_ = 0; - AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts, uint32_t node_id) : Queue(0, 0), LocalSignal(0, false), DoorbellSignal(signal()), diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 0a55dc9aad..f6a08c337d 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -73,11 +73,6 @@ namespace rocr { namespace AMD { -HsaEvent* AqlQueue::queue_event_ = nullptr; -std::atomic AqlQueue::queue_count_(0); -KernelMutex AqlQueue::queue_lock_; -int AqlQueue::rtti_id_ = 0; - AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch, core::HsaEventCallback callback, void* err_data, bool is_kv) : Queue(agent->node_id(), agent->isMES() ? (MemoryRegion::AllocateGTTAccess | MemoryRegion::AllocateNonPaged) : 0), @@ -236,11 +231,11 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr } MAKE_NAMED_SCOPE_GUARD(EventGuard, [&]() { - ScopedAcquire _lock(&queue_lock_); - queue_count_--; - if (queue_count_ == 0) { - core::InterruptSignal::DestroyEvent(queue_event_); - queue_event_ = nullptr; + ScopedAcquire _lock(&queue_lock()); + queue_count()--; + if (queue_count() == 0) { + core::InterruptSignal::DestroyEvent(queue_event()); + queue_event() = nullptr; } }); @@ -251,20 +246,20 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr }); if (core::g_use_interrupt_wait) { - ScopedAcquire _lock(&queue_lock_); - queue_count_++; - if (queue_event_ == nullptr) { - assert(queue_count_ == 1 && "Inconsistency in queue event reference counting found.\n"); + ScopedAcquire _lock(&queue_lock()); + queue_count()++; + if (queue_event() == nullptr) { + assert(queue_count() == 1 && "Inconsistency in queue event reference counting found.\n"); - queue_event_ = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_SIGNAL, false); - if (queue_event_ == nullptr) + queue_event() = core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_SIGNAL, false); + if (queue_event() == nullptr) throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Queue event creation failed.\n"); } - auto Signal = new core::InterruptSignal(0, queue_event_); + auto Signal = new core::InterruptSignal(0, queue_event()); assert(Signal != nullptr && "Should have thrown!\n"); amd_queue_.queue_inactive_signal = core::InterruptSignal::Convert(Signal); - exception_signal_ = new core::InterruptSignal(0, queue_event_); + exception_signal_ = new core::InterruptSignal(0, queue_event()); assert(exception_signal_ != nullptr && "Should have thrown!\n"); } else { EventGuard.Dismiss(); @@ -284,7 +279,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr if (core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging) { queue_rsrc.ErrorReason = &exception_signal_->signal_.value; kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_, - ring_buf_alloc_bytes_, queue_event_, &queue_rsrc); + ring_buf_alloc_bytes_, queue_event(), &queue_rsrc); } else { kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_, ring_buf_alloc_bytes_, NULL, &queue_rsrc); @@ -382,11 +377,11 @@ AqlQueue::~AqlQueue() { exception_signal_->DestroySignal(); HSA::hsa_signal_destroy(amd_queue_.queue_inactive_signal); if (core::g_use_interrupt_wait) { - ScopedAcquire lock(&queue_lock_); - queue_count_--; - if (queue_count_ == 0) { - core::InterruptSignal::DestroyEvent(queue_event_); - queue_event_ = nullptr; + ScopedAcquire lock(&queue_lock()); + queue_count()--; + if (queue_count() == 0) { + core::InterruptSignal::DestroyEvent(queue_event()); + queue_event() = nullptr; } } agent_->system_deallocator()(pm4_ib_buf_); diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp index 51814d0778..36d21fa1ca 100644 --- a/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp @@ -54,7 +54,8 @@ namespace rocr { namespace AMD { static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID; -static std::string kBlitKernelSource(R"( +static std::string& kBlitKernelSource() { + static std::string kBlitKernelSource_(R"( // Compatibility function for GFXIP 7. function s_load_dword_offset(byte_offset) @@ -874,29 +875,44 @@ L_FILL_PHASE_2_DONE: s_endpgm end )"); + return kBlitKernelSource_; +} // Search kernel source for variable definition and return value. int GetKernelSourceParam(const char* paramName) { std::stringstream paramDef; paramDef << "var " << paramName << " = "; - std::string::size_type paramDefLoc = kBlitKernelSource.find(paramDef.str()); + std::string::size_type paramDefLoc = + kBlitKernelSource().find(paramDef.str()); assert(paramDefLoc != std::string::npos); std::string::size_type paramValLoc = paramDefLoc + paramDef.str().size(); std::string::size_type paramEndLoc = - kBlitKernelSource.find('\n', paramDefLoc); + kBlitKernelSource().find('\n', paramDefLoc); assert(paramDefLoc != std::string::npos); - std::string paramVal(&kBlitKernelSource[paramValLoc], - &kBlitKernelSource[paramEndLoc]); + std::string paramVal(&kBlitKernelSource()[paramValLoc], + &kBlitKernelSource()[paramEndLoc]); return std::stoi(paramVal); } -static int kCopyAlignedVecWidth = GetKernelSourceParam("kCopyAlignedVecWidth"); -static int kCopyAlignedUnroll = GetKernelSourceParam("kCopyAlignedUnroll"); -static int kCopyMisalignedUnroll = GetKernelSourceParam("kCopyMisalignedUnroll"); -static int kFillVecWidth = GetKernelSourceParam("kFillVecWidth"); -static int kFillUnroll = GetKernelSourceParam("kFillUnroll"); + +#define DEFINE_KERNEL_PARAM_FUNC(name) \ +static int& name() { \ + static std::once_flag initFlag; \ + static int val; \ + std::call_once(initFlag, [&]() { \ + val = GetKernelSourceParam(#name); \ + }); \ + return val; \ +} + +// Use the macro to define the functions +DEFINE_KERNEL_PARAM_FUNC(kCopyAlignedVecWidth) +DEFINE_KERNEL_PARAM_FUNC(kCopyAlignedUnroll) +DEFINE_KERNEL_PARAM_FUNC(kCopyMisalignedUnroll) +DEFINE_KERNEL_PARAM_FUNC(kFillVecWidth) +DEFINE_KERNEL_PARAM_FUNC(kFillUnroll) static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) { return (v >> pos) & ((1 << width) - 1); @@ -1093,7 +1109,7 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand( // Phase 2 (unrolled dwordx4 copy) ends when last whole block fits. uint64_t phase2_block = num_workitems * sizeof(uint32_t) * - kCopyAlignedUnroll * kCopyAlignedVecWidth; + kCopyAlignedUnroll() * kCopyAlignedVecWidth(); uint64_t phase2_size = ((size - phase1_size) / phase2_block) * phase2_block; // Phase 3 (dword copy) ends when last whole dword fits. @@ -1125,7 +1141,7 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand( uintptr_t src_start = uintptr_t(src); uintptr_t dst_start = uintptr_t(dst); uint64_t phase1_block = - num_workitems * sizeof(uint8_t) * kCopyMisalignedUnroll; + num_workitems * sizeof(uint8_t) * kCopyMisalignedUnroll(); uint64_t phase1_size = (size / phase1_block) * phase1_block; args->copy_misaligned.phase1_src_start = src_start; @@ -1164,7 +1180,7 @@ hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value, uint64_t fill_size = count * sizeof(uint32_t); uint64_t phase1_block = - num_workitems * sizeof(uint32_t) * kFillUnroll * kFillVecWidth; + num_workitems * sizeof(uint32_t) * kFillUnroll() * kFillVecWidth(); uint64_t phase1_size = (fill_size / phase1_block) * phase1_block; KernelArgs* args = ObtainAsyncKernelCopyArg(); diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 843c29ffef..38f35432d4 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -70,6 +70,7 @@ #include "core/inc/amd_trap_handler_v1.h" #include "core/inc/amd_blit_shaders.h" +#include "core/inc/hsa_api_trace_int.h" // Generated header #include "amd_trap_handler_v2.h" #include "amd_blit_shaders_v2.h" @@ -88,9 +89,6 @@ #define DEFAULT_SCRATCH_SINGLE_LIMIT_ASYNC_PER_XCC (1 << 30) // 1 GB namespace rocr { -namespace core { -extern HsaApiTable hsa_internal_api_table_; -} // namespace core namespace AMD { const uint64_t CP_DMA_DATA_TRANSFER_CNT_MAX = (1 << 26); @@ -1397,15 +1395,15 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { ((uint8_t*)value)[index] |= 1 << subBit; }; - if (core::hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) { + if (core::hsa_internal_api_table().finalizer_api.hsa_ext_program_finalize_fn != NULL) { setFlag(HSA_EXTENSION_FINALIZER); } - if (core::hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) { + if (core::hsa_internal_api_table().image_api.hsa_ext_image_create_fn != NULL) { setFlag(HSA_EXTENSION_IMAGES); } - if (core::hsa_internal_api_table_.pcs_api.hsa_ven_amd_pcs_iterate_configuration_fn != NULL) { + if (core::hsa_internal_api_table().pcs_api.hsa_ven_amd_pcs_iterate_configuration_fn != NULL) { setFlag(HSA_EXTENSION_AMD_PC_SAMPLING); } diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index 6ac7e55ccf..5bce5c7991 100644 --- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -57,7 +57,6 @@ namespace AMD { // Tracks aggregate size of system memory available on platform size_t MemoryRegion::max_sysmem_alloc_size_ = 0; -size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE); bool MemoryRegion::RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags) { assert(ptr != NULL); @@ -125,7 +124,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, virtual_size_ = kGpuVmSize; } else if (IsSystem()) { - mem_flag_.ui32.PageSize = MemoryRegion::kPageSize_; + mem_flag_.ui32.PageSize = MemoryRegion::kPageSize(); mem_flag_.ui32.NoSubstitute = 0; mem_flag_.ui32.HostAccess = 1; mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED; @@ -138,7 +137,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, // Adjust allocatable size per page align - max_single_alloc_size_ = AlignDown(static_cast(GetPhysicalSize()), kPageSize_); + max_single_alloc_size_ = AlignDown(static_cast(GetPhysicalSize()), kPageSize()); // Keep track of total system memory available // @note: System memory is surfaced as both coarse @@ -150,7 +149,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, } assert(GetVirtualSize() != 0); - assert(IsMultipleOf(max_single_alloc_size_, kPageSize_)); + assert(IsMultipleOf(max_single_alloc_size_, kPageSize())); } MemoryRegion::~MemoryRegion() {} @@ -177,7 +176,7 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, return HSA_STATUS_ERROR_INVALID_ALLOCATION; } - size = AlignUp(size, kPageSize_); + size = AlignUp(size, kPageSize()); return core::Runtime::runtime_singleton_->AgentDriver(owner()->driver_type) .AllocateMemory(*this, alloc_flags, address, size, agent_node_id); @@ -279,7 +278,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_HEAPTYPE_DEVICE_SVM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: - *((size_t*)value) = kPageSize_; + *((size_t*)value) = kPageSize(); break; default: *((size_t*)value) = 0; @@ -292,7 +291,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_HEAPTYPE_DEVICE_SVM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: - *((size_t*)value) = kPageSize_; + *((size_t*)value) = kPageSize(); break; default: *((size_t*)value) = 0; @@ -361,12 +360,12 @@ hsa_status_t MemoryRegion::GetPoolInfo(hsa_amd_memory_pool_info_t attribute, case HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: - *((size_t*)value) = kPageSize_; + *((size_t*)value) = kPageSize(); break; case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: *((size_t*)value) = core::Runtime::runtime_singleton_->flag().disable_fragment_alloc() - ? kPageSize_ + ? kPageSize() : fragment_allocator_.default_block_size(); break; default: diff --git a/runtime/hsa-runtime/core/runtime/default_signal.cpp b/runtime/hsa-runtime/core/runtime/default_signal.cpp index b3e5a23f28..0c5e7ca596 100644 --- a/runtime/hsa-runtime/core/runtime/default_signal.cpp +++ b/runtime/hsa-runtime/core/runtime/default_signal.cpp @@ -51,9 +51,6 @@ namespace rocr { namespace core { -int DefaultSignal::rtti_id_ = 0; -int BusyWaitSignal::rtti_id_ = 0; - BusyWaitSignal::BusyWaitSignal(SharedSignal* abi_block, bool enableIPC) : Signal(abi_block, enableIPC) { signal_.kind = AMD_SIGNAL_KIND_USER; diff --git a/runtime/hsa-runtime/core/runtime/host_queue.cpp b/runtime/hsa-runtime/core/runtime/host_queue.cpp index aaee7933ee..43de8baca3 100644 --- a/runtime/hsa-runtime/core/runtime/host_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/host_queue.cpp @@ -48,9 +48,6 @@ namespace rocr { namespace core { -int HostQueue::rtti_id_ = 0; -std::atomic HostQueue::queue_count_(0x80000000); - HostQueue::HostQueue(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type, uint32_t features, hsa_signal_t doorbell_signal) : Queue(), size_(ring_size) { diff --git a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index 02d1e6bee9..8e42cae8b5 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -64,8 +64,15 @@ hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifie namespace core { -HsaApiTable hsa_api_table_; -HsaApiTable hsa_internal_api_table_; +HsaApiTable& hsa_api_table() { + static HsaApiTable table; + return table; +} + +HsaApiTable& hsa_internal_api_table() { + static HsaApiTable table; + return table; +} HsaApiTable::HsaApiTable() { Init(); @@ -483,7 +490,7 @@ void HsaApiTable::UpdateTools() { } void LoadInitialHsaApiTable() { - hsa_table_interface_init(&hsa_api_table_.hsa_api); + hsa_table_interface_init(&hsa_api_table().hsa_api); } } // namespace core diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_interface.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_interface.cpp index d872e485b6..8271176e2a 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_ext_interface.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_ext_interface.cpp @@ -119,28 +119,28 @@ void ExtensionEntryPoints::InitPcSamplingExtTable() { // Initialize Amd Ext table for Api related to Images void ExtensionEntryPoints::InitAmdExtTable() { - hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null; - hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null; + hsa_api_table().amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null; + hsa_internal_api_table().amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null; } // Update Amd Ext table for Api related to Images. // @note: Interface should be updated when Amd Ext table // begins hosting Api's from other extension libraries void ExtensionEntryPoints::UpdateAmdExtTable(decltype(::hsa_amd_image_create)* func_ptr) { - assert(hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn == + assert(hsa_api_table().amd_ext_api.hsa_amd_image_create_fn == (decltype(hsa_amd_image_create)*)hsa_ext_null && "Duplicate load of extension import."); - assert(hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn == + assert(hsa_internal_api_table().amd_ext_api.hsa_amd_image_create_fn == (decltype(hsa_amd_image_create)*)hsa_ext_null && "Duplicate load of extension import."); - hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = func_ptr; - hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = func_ptr; + hsa_api_table().amd_ext_api.hsa_amd_image_create_fn = func_ptr; + hsa_internal_api_table().amd_ext_api.hsa_amd_image_create_fn = func_ptr; } void ExtensionEntryPoints::UnloadImage() { InitAmdExtTable(); InitImageExtTable(); - core::hsa_internal_api_table_.Reset(); + core::hsa_internal_api_table().Reset(); #ifdef HSA_IMAGE_SUPPORT rocr::image::ReleaseImageRsrcs(); #endif @@ -172,7 +172,7 @@ void ExtensionEntryPoints::Unload() { InitPcSamplingExtTable(); InitImageExtTable(); InitAmdExtTable(); - core::hsa_internal_api_table_.Reset(); + core::hsa_internal_api_table().Reset(); } bool ExtensionEntryPoints::LoadImage() { @@ -193,7 +193,7 @@ bool ExtensionEntryPoints::LoadImage() { image_api.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION; // Update private copy of Api table with handle for Image extensions - hsa_internal_api_table_.CloneExts(&image_api, + hsa_internal_api_table().CloneExts(&image_api, core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID); // Update Amd Ext Api table Api that deals with Images @@ -215,7 +215,8 @@ void ExtensionEntryPoints::LoadPcSampling() { pcs_api.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION; // Update private copy of Api table with handle for Image extensions - hsa_internal_api_table_.CloneExts(&pcs_api, core::HsaApiTable::HSA_EXT_PC_SAMPLING_API_TABLE_ID); + hsa_internal_api_table().CloneExts(&pcs_api, + core::HsaApiTable::HSA_EXT_PC_SAMPLING_API_TABLE_ID); #endif } @@ -287,12 +288,12 @@ bool ExtensionEntryPoints::LoadFinalizer(std::string library_name) { finalizer_api.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION; // Update handle of table of HSA extensions - hsa_internal_api_table_.CloneExts(&finalizer_api, + hsa_internal_api_table().CloneExts(&finalizer_api, core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID); ptr = os::GetExportAddress(lib, "Load"); if (ptr != NULL) { - ((Load_t)ptr)(&core::hsa_internal_api_table_.hsa_api); + ((Load_t)ptr)(&core::hsa_internal_api_table().hsa_api); } return true; diff --git a/runtime/hsa-runtime/core/runtime/intercept_queue.cpp b/runtime/hsa-runtime/core/runtime/intercept_queue.cpp index 0d08db7c13..a86dabb3c2 100644 --- a/runtime/hsa-runtime/core/runtime/intercept_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/intercept_queue.cpp @@ -80,8 +80,6 @@ static const uint16_t kBarrierHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKE (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -int InterceptQueue::rtti_id_ = 0; - bool InterceptQueue::IsPendingRetryPoint(uint64_t wrapped_current_read_index) const { // This function is intended to determine if the last retry barrier packet // has definitely not been processed in order to avoid putting multiple retry diff --git a/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp b/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp index 428d7ab99d..6acb75c2e4 100644 --- a/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp +++ b/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp @@ -74,8 +74,6 @@ void InterruptSignal::EventPool::free(HsaEvent* evt) { events_.push_back(unique_event_ptr(evt)); } -int InterruptSignal::rtti_id_ = 0; - HsaEvent* InterruptSignal::CreateEvent(HSA_EVENTTYPE type, bool manual_reset) { HsaEventDescriptor event_descriptor; event_descriptor.EventType = type; diff --git a/runtime/hsa-runtime/core/runtime/ipc_signal.cpp b/runtime/hsa-runtime/core/runtime/ipc_signal.cpp index 485a5c915a..2c22918053 100644 --- a/runtime/hsa-runtime/core/runtime/ipc_signal.cpp +++ b/runtime/hsa-runtime/core/runtime/ipc_signal.cpp @@ -50,7 +50,6 @@ namespace rocr { namespace core { -int IPCSignal::rtti_id_ = 0; KernelMutex IPCSignal::lock_; SharedMemory::SharedMemory(const hsa_amd_ipc_memory_t* handle, size_t len) { diff --git a/runtime/hsa-runtime/core/runtime/isa.cpp b/runtime/hsa-runtime/core/runtime/isa.cpp index 65cc8bf7d1..140802e10c 100755 --- a/runtime/hsa-runtime/core/runtime/isa.cpp +++ b/runtime/hsa-runtime/core/runtime/isa.cpp @@ -199,12 +199,14 @@ hsa_round_method_t Isa::GetRoundMethod( } const Isa *IsaRegistry::GetIsa(const std::string &full_name) { - auto isareg_iter = supported_isas_.find(full_name); - return isareg_iter == supported_isas_.end() ? nullptr : &isareg_iter->second; + auto isareg_iter = GetSupportedIsas().find(full_name); + return isareg_iter == GetSupportedIsas().end() ? + nullptr : &isareg_iter->second; } const Isa *IsaRegistry::GetIsa(const Isa::Version &version, IsaFeature sramecc, IsaFeature xnack) { - auto isareg_iter = std::find_if(supported_isas_.begin(), supported_isas_.end(), + auto isareg_iter = std::find_if(GetSupportedIsas().begin(), + GetSupportedIsas().end(), [&](const IsaMap::value_type& isareg) { return isareg.second.GetVersion() == version && (isareg.second.GetSramecc() == IsaFeature::Unsupported || @@ -212,13 +214,12 @@ const Isa *IsaRegistry::GetIsa(const Isa::Version &version, IsaFeature sramecc, (isareg.second.GetXnack() == IsaFeature::Unsupported || isareg.second.GetXnack() == xnack); }); - return isareg_iter == supported_isas_.end() ? nullptr : &isareg_iter->second; + return isareg_iter == GetSupportedIsas().end() ? + nullptr : &isareg_iter->second; } -const IsaRegistry::IsaMap IsaRegistry::supported_isas_ = - IsaRegistry::GetSupportedIsas(); -const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() { +const IsaRegistry::IsaMap& IsaRegistry::GetSupportedIsas() { // agent, and vendor name length limit excluding terminating nul character. constexpr size_t hsa_name_size = 63; @@ -236,11 +237,16 @@ constexpr size_t hsa_name_size = 63; amd_amdgpu_##maj##min##stp##_SRAMECC_##sramecc##_XNACK_##xnack##_WAVEFRONTSIZE_##wavefrontsize.GetIsaName(), \ amd_amdgpu_##maj##min##stp##_SRAMECC_##sramecc##_XNACK_##xnack##_WAVEFRONTSIZE_##wavefrontsize)); \ - IsaMap supported_isas; - IsaFeature unsupported = IsaFeature::Unsupported; - IsaFeature any = IsaFeature::Any; - IsaFeature disabled = IsaFeature::Disabled; - IsaFeature enabled = IsaFeature::Enabled; + static IsaMap supported_isas; + + if (supported_isas.size() > 0) { + return supported_isas; + } + + const IsaFeature unsupported = IsaFeature::Unsupported; + const IsaFeature any = IsaFeature::Any; + const IsaFeature disabled = IsaFeature::Disabled; + const IsaFeature enabled = IsaFeature::Enabled; // Target ID Version SRAMECC XNACK ISAREG_ENTRY_GEN("gfx700", 7, 0, 0, unsupported, unsupported, 64) diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index 6c49feab0f..e03ca5095c 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -40,8 +40,6 @@ // //////////////////////////////////////////////////////////////////////////////// -#include "core/inc/runtime.h" - #include #include #include @@ -50,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +58,9 @@ #include #include +#include "core/inc/runtime.h" +#include "core/inc/hsa_table_interface.h" + #if defined(HSA_ROCPROFILER_REGISTER) && HSA_ROCPROFILER_REGISTER > 0 #include #endif @@ -77,6 +79,7 @@ #include "core/inc/exceptions.h" #include "inc/hsa_ven_amd_aqlprofile.h" #include "core/inc/amd_core_dump.hpp" +#include "core/inc/host_queue.h" #ifndef HSA_VERSION_MAJOR #define HSA_VERSION_MAJOR 1 @@ -97,16 +100,20 @@ ROCPROFILER_REGISTER_DEFINE_IMPORT(hsa, ROCP_REG_VERSION) const char rocrbuildid[] __attribute__((used)) = "ROCR BUILD ID: " STRING(ROCR_BUILD_ID); +extern r_debug _amdgpu_r_debug; +extern void _loader_debug_state(); + namespace rocr { namespace core { -bool g_use_interrupt_wait = true; -bool g_use_mwaitx = true; - +bool g_use_interrupt_wait; +bool g_use_mwaitx; Runtime* Runtime::runtime_singleton_ = NULL; -KernelMutex Runtime::bootstrap_lock_; -static bool loaded = true; +__forceinline static bool& loaded() { + static bool loaded_ = true; + return loaded_; +} class RuntimeCleanup { public: @@ -115,7 +122,7 @@ class RuntimeCleanup { delete Runtime::runtime_singleton_; } - loaded = false; + loaded() = false; } }; @@ -123,9 +130,9 @@ static RuntimeCleanup cleanup_at_unload_; hsa_status_t Runtime::Acquire() { // Check to see if HSA has been cleaned up (process exit) - if (!loaded) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + if (!loaded()) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - ScopedAcquire boot(&bootstrap_lock_); + ScopedAcquire boot(&bootstrap_lock()); if (runtime_singleton_ == NULL) { memset(log_flags, 0, sizeof(log_flags)); @@ -153,9 +160,9 @@ hsa_status_t Runtime::Acquire() { hsa_status_t Runtime::Release() { // Check to see if HSA has been cleaned up (process exit) - if (!loaded) return HSA_STATUS_SUCCESS; + if (!loaded()) return HSA_STATUS_SUCCESS; - ScopedAcquire boot(&bootstrap_lock_); + ScopedAcquire boot(&bootstrap_lock()); if (runtime_singleton_ == nullptr) return HSA_STATUS_ERROR_NOT_INITIALIZED; @@ -747,11 +754,11 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) { ((uint8_t*)value)[index] |= 1 << subBit; }; - if (hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) { + if (hsa_internal_api_table().finalizer_api.hsa_ext_program_finalize_fn != NULL) { setFlag(HSA_EXTENSION_FINALIZER); } - if (hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) { + if (hsa_internal_api_table().image_api.hsa_ext_image_create_fn != NULL) { setFlag(HSA_EXTENSION_IMAGES); } @@ -1944,6 +1951,16 @@ Runtime::Runtime() asyncSignals_.monitor_exceptions = false; asyncExceptions_.monitor_exceptions = true; + g_use_interrupt_wait = true; + g_use_mwaitx = true; + ::_amdgpu_r_debug = {10, + nullptr, + reinterpret_cast( + &_loader_debug_state), + r_debug::RT_CONSISTENT, + 0}; + + log_file = stderr; } hsa_status_t Runtime::Load() { @@ -2068,17 +2085,17 @@ void Runtime::LoadExtensions() { // Skipping finalizer loading since finalizer is no longer distributed. // LinkExts will expose the finalizer-not-present implementation. // extensions_.LoadFinalizer(kFinalizerLib[os_index(os::current_os)]); - hsa_api_table_.LinkExts(&extensions_.finalizer_api, + hsa_api_table().LinkExts(&extensions_.finalizer_api, core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID); // Update Hsa Api Table with handle of Image extension Apis extensions_.LoadImage(); - hsa_api_table_.LinkExts(&extensions_.image_api, + hsa_api_table().LinkExts(&extensions_.image_api, core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID); // Update Hsa Api Table with handle of PCS extension Apis extensions_.LoadPcSampling(); - hsa_api_table_.LinkExts(&extensions_.pcs_api, + hsa_api_table().LinkExts(&extensions_.pcs_api, core::HsaApiTable::HSA_EXT_PC_SAMPLING_API_TABLE_ID); } @@ -2216,7 +2233,7 @@ void Runtime::LoadTools() { #if defined(HSA_ROCPROFILER_REGISTER) && HSA_ROCPROFILER_REGISTER > 0 if (!flag().disable_tool_register()) { - auto* profiler_api_table_ = static_cast(&hsa_api_table_); + auto* profiler_api_table_ = static_cast(&hsa_api_table()); auto lib_id = rocprofiler_register_library_indentifier_t{}; auto rocp_reg_status = rocprofiler_register_library_api_table("hsa", &ROCPROFILER_REGISTER_IMPORT_FUNC(hsa), @@ -2272,8 +2289,8 @@ void Runtime::LoadTools() { } // Discover loaded tools. - std::vector loaded = os::GetLoadedToolsLib(); - for(auto& handle : loaded) { + std::vector loaded_hds = os::GetLoadedToolsLib(); + for(auto& handle : loaded_hds) { const uint32_t* order = (const uint32_t*)os::GetExportAddress(handle, "HSA_AMD_TOOL_PRIORITY"); if(order) { sorted.push_back(lib_t(handle, *order+env_count, os::GetLibraryName(handle))); @@ -2333,8 +2350,8 @@ void Runtime::LoadTools() { os::CloseLib(tool); continue; } - if (!ld(&hsa_api_table_.hsa_api, - hsa_api_table_.hsa_api.version.major_id, + if (!ld(&hsa_api_table().hsa_api, + hsa_api_table().hsa_api.version.major_id, failed.size(), failed.data())) { failed.push_back(lib.name_.c_str()); os::CloseLib(tool); @@ -2375,7 +2392,7 @@ void Runtime::UnloadTools() { } // Reset API table in case some tool doesn't cleanup properly - hsa_api_table_.Reset(); + hsa_api_table().Reset(); } void Runtime::CloseTools() { diff --git a/runtime/hsa-runtime/core/runtime/signal.cpp b/runtime/hsa-runtime/core/runtime/signal.cpp index e8fbe76993..8a60e25994 100644 --- a/runtime/hsa-runtime/core/runtime/signal.cpp +++ b/runtime/hsa-runtime/core/runtime/signal.cpp @@ -64,7 +64,7 @@ void SharedSignalPool_t::clear() { capacity - free_list_.size()); } - for (auto& block : block_list_) free_(block.first); + for (auto& block : block_list_) free_()(block.first); block_list_.clear(); free_list_.clear(); } @@ -73,15 +73,15 @@ SharedSignal* SharedSignalPool_t::alloc() { ScopedAcquire lock(&lock_); if (free_list_.empty()) { SharedSignal* block = reinterpret_cast( - allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0)); + allocate_()(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0)); if (block == nullptr) { block_size_ = minblock_; block = reinterpret_cast( - allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0)); + allocate_()(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0)); if (block == nullptr) throw std::bad_alloc(); } - MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(block); }); + MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_()(block); }); block_list_.push_back(std::make_pair(block, block_size_)); throwGuard.Dismiss(); diff --git a/runtime/hsa-runtime/image/image_runtime.cpp b/runtime/hsa-runtime/image/image_runtime.cpp index 3e015be94b..a7cd66d035 100644 --- a/runtime/hsa-runtime/image/image_runtime.cpp +++ b/runtime/hsa-runtime/image/image_runtime.cpp @@ -61,9 +61,6 @@ namespace rocr { namespace image { -std::atomic ImageRuntime::instance_(NULL); -std::mutex ImageRuntime::instance_mutex_; - hsa_status_t FindKernelArgPool(hsa_amd_memory_pool_t pool, void* data) { assert(data != nullptr); @@ -154,13 +151,13 @@ hsa_status_t ImageRuntime::CreateImageManager(hsa_agent_t agent, void* data) { } ImageRuntime* ImageRuntime::instance() { - ImageRuntime* instance = instance_.load(std::memory_order_acquire); + ImageRuntime* instance = get_instance().load(std::memory_order_acquire); if (instance == NULL) { // Protect the initialization from multi threaded access. - std::lock_guard lock(instance_mutex_); + std::lock_guard lock(instance_mutex()); // Make sure we are not initializing it twice. - instance = instance_.load(std::memory_order_relaxed); + instance = get_instance().load(std::memory_order_relaxed); if (instance != NULL) { return instance; } @@ -194,19 +191,19 @@ ImageRuntime* ImageRuntime::CreateSingleton() { assert(instance->kernarg_pool_.handle != 0); assert(instance->image_managers_.size() != 0); - instance_.store(instance, std::memory_order_release); + get_instance().store(instance, std::memory_order_release); return instance; } void ImageRuntime::DestroySingleton() { - ImageRuntime* instance = instance_.load(std::memory_order_acquire); + ImageRuntime* instance = get_instance().load(std::memory_order_acquire); if (instance == NULL) { return; } instance->Cleanup(); - instance_.store(NULL, std::memory_order_release); + get_instance().store(NULL, std::memory_order_release); delete instance; } diff --git a/runtime/hsa-runtime/image/image_runtime.h b/runtime/hsa-runtime/image/image_runtime.h index f6f3dafcf4..520b6f80aa 100644 --- a/runtime/hsa-runtime/image/image_runtime.h +++ b/runtime/hsa-runtime/image/image_runtime.h @@ -163,9 +163,15 @@ class ImageRuntime { void Cleanup(); /// Pointer to singleton object. - static std::atomic instance_; + static __forceinline std::atomic& get_instance() { + static std::atomic instance_(NULL); + return instance_; + } - static std::mutex instance_mutex_; + static __forceinline std::mutex& instance_mutex() { + static std::mutex instance_mutex_; + return instance_mutex_; + } /// @brief Contains mapping of agent and its corresponding ::ImageManager /// object. diff --git a/runtime/hsa-runtime/loader/executable.cpp b/runtime/hsa-runtime/loader/executable.cpp index 4ad78c4a57..36b285beac 100644 --- a/runtime/hsa-runtime/loader/executable.cpp +++ b/runtime/hsa-runtime/loader/executable.cpp @@ -68,7 +68,7 @@ using namespace rocr::amd::hsa::common; // Having a side effect prevents call site optimization that allows removal of a noinline function call // with no side effect. -__attribute__((noinline)) static void _loader_debug_state() { +__attribute__((noinline)) void _loader_debug_state() { static volatile int function_needs_a_side_effect = 0; function_needs_a_side_effect ^= 1; } @@ -84,12 +84,12 @@ __attribute__((noinline)) static void _loader_debug_state() { // 9: New trap handler ABI. For gfx11: Save PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped. // 10: New trap handler ABI. Set status.skip_export when halting the wave. // For gfx940, set ttmp6[31] = 0 if ttmp11[31] == 0. -HSA_API r_debug _amdgpu_r_debug = {10, - nullptr, - reinterpret_cast(&_loader_debug_state), - r_debug::RT_CONSISTENT, - 0}; -static link_map* r_debug_tail = nullptr; + +HSA_API r_debug _amdgpu_r_debug; +static __forceinline link_map*& r_debug_tail() { + static link_map* r_debug_tail_ = nullptr; + return r_debug_tail_; +} namespace rocr { namespace amd { @@ -175,7 +175,7 @@ void Loader::Destroy(Loader *loader) // Loader resets the link_map, but the executables and loaded code objects are not deleted. _amdgpu_r_debug.r_map = nullptr; _amdgpu_r_debug.r_state = r_debug::RT_CONSISTENT; - r_debug_tail = nullptr; + r_debug_tail() = nullptr; delete loader; } @@ -201,21 +201,21 @@ Executable* AmdHsaCodeLoader::CreateExecutable( } static void AddCodeObjectInfoIntoDebugMap(link_map* map) { - if (r_debug_tail) { - r_debug_tail->l_next = map; - map->l_prev = r_debug_tail; + if (r_debug_tail()) { + r_debug_tail()->l_next = map; + map->l_prev = r_debug_tail(); map->l_next = nullptr; } else { _amdgpu_r_debug.r_map = map; map->l_prev = nullptr; map->l_next = nullptr; } - r_debug_tail = map; + r_debug_tail() = map; } static void RemoveCodeObjectInfoFromDebugMap(link_map* map) { - if (r_debug_tail == map) { - r_debug_tail = map->l_prev; + if (r_debug_tail() == map) { + r_debug_tail() = map->l_prev; } if (_amdgpu_r_debug.r_map == map) { _amdgpu_r_debug.r_map = map->l_next; diff --git a/runtime/hsa-runtime/pcs/pcs_runtime.cpp b/runtime/hsa-runtime/pcs/pcs_runtime.cpp index 9d453bb319..31a7fe94d3 100644 --- a/runtime/hsa-runtime/pcs/pcs_runtime.cpp +++ b/runtime/hsa-runtime/pcs/pcs_runtime.cpp @@ -57,17 +57,15 @@ do { \ if ((ptr) == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT; \ } while (false) -std::atomic PcsRuntime::instance_(NULL); -std::mutex PcsRuntime::instance_mutex_; PcsRuntime* PcsRuntime::instance() { - PcsRuntime* instance = instance_.load(std::memory_order_acquire); + PcsRuntime* instance = get_instance().load(std::memory_order_acquire); if (instance == NULL) { // Protect the initialization from multi threaded access. - std::lock_guard lock(instance_mutex_); + std::lock_guard lock(instance_mutex()); // Make sure we are not initializing it twice. - instance = instance_.load(std::memory_order_relaxed); + instance = get_instance().load(std::memory_order_relaxed); if (instance != NULL) { return instance; } @@ -84,17 +82,17 @@ PcsRuntime* PcsRuntime::instance() { PcsRuntime* PcsRuntime::CreateSingleton() { PcsRuntime* instance = new PcsRuntime(); - instance_.store(instance, std::memory_order_release); + get_instance().store(instance, std::memory_order_release); return instance; } void PcsRuntime::DestroySingleton() { - PcsRuntime* instance = instance_.load(std::memory_order_acquire); + PcsRuntime* instance = get_instance().load(std::memory_order_acquire); if (instance == NULL) { return; } - instance_.store(NULL, std::memory_order_release); + get_instance().store(NULL, std::memory_order_release); delete instance; } diff --git a/runtime/hsa-runtime/pcs/pcs_runtime.h b/runtime/hsa-runtime/pcs/pcs_runtime.h index 6fa489c738..b860ae6a49 100644 --- a/runtime/hsa-runtime/pcs/pcs_runtime.h +++ b/runtime/hsa-runtime/pcs/pcs_runtime.h @@ -152,9 +152,14 @@ class PcsRuntime { static PcsRuntime* CreateSingleton(); /// Pointer to singleton object. - static std::atomic instance_; - static std::mutex instance_mutex_; - + static __forceinline std::atomic& get_instance() { + static std::atomic instance_(nullptr); + return instance_; + } + static __forceinline std::mutex& instance_mutex() { + static std::mutex instance_mutex_; + return instance_mutex_; +} // Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle std::map pc_sampling_; KernelMutex pc_sampling_lock_;