From aef18896dd12f57df3cd1ec726495fb75f7c5ffc Mon Sep 17 00:00:00 2001
From: Giovanni Lenzi Baraldi
Date: Thu, 24 Oct 2024 16:58:07 -0300
Subject: [PATCH] SWDEV-489158: Optimizing counter collection performance (#1150)

* SWDEV-489158: Optimizing counter collection performance

* Static initializer fix

* adding sched_yield+sleep
---
 source/lib/rocprofiler-sdk/hsa/queue.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/source/lib/rocprofiler-sdk/hsa/queue.cpp b/source/lib/rocprofiler-sdk/hsa/queue.cpp
index 211f31d653..00ff0168e6 100644
--- a/source/lib/rocprofiler-sdk/hsa/queue.cpp
+++ b/source/lib/rocprofiler-sdk/hsa/queue.cpp
@@ -67,6 +67,14 @@ namespace hsa
 {
 namespace
 {
+static std::atomic<int64_t>&
+get_balanced_signal_slots()
+{
+    constexpr int64_t NUM_SIGNALS = 16;
+    static auto*& atomic = common::static_object<std::atomic<int64_t>>::construct(NUM_SIGNALS);
+    return *atomic;
+}
+
 template <typename DomainT, typename... Args>
 inline bool
 context_filter(const context::context* ctx, DomainT domain, Args... args)
@@ -106,6 +114,8 @@ AsyncSignalHandler(hsa_signal_value_t /*signal_v*/, void* data)
         return false;
     }
 
+    get_balanced_signal_slots().fetch_add(1);
+
     auto& queue_info_session = *static_cast<Queue::queue_info_session_t*>(data);
     auto dispatch_time = kernel_dispatch::get_dispatch_time(queue_info_session);
 
@@ -342,6 +352,13 @@ WriteInterceptor(const void* packets,
             thr_id,
             ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH);
 
+        // If there is a lot of contention for HSA signals, then schedule out the thread
+        if(get_balanced_signal_slots().fetch_sub(1) <= 0)
+        {
+            sched_yield();
+            std::this_thread::sleep_for(std::chrono::microseconds(1));
+        }
+
         // Stores the instrumentation pkt (i.e. AQL packets for counter collection)
         // along with an ID of the client we got the packet from (this will be returned via
         // completed_cb_t)