From d05031db89f942111d46cc215a4ffe14e3cb5855 Mon Sep 17 00:00:00 2001 From: Benjamin Welton Date: Thu, 19 Oct 2023 11:18:42 -0700 Subject: [PATCH] Limit the number of HSA signals that are active (#140) * Limit the number of HSA signals that are active There is a hard limit currently to the number of signals that HSA allows to be created (before weird stuff happens such as hangs or straight up crashes in HSA). While there is some work going on to fix this in HSA/AQL. Lets limit the number we create. Increased the counter colleciton example to 200K launches, which with this change no longer hangs/crashes randomly in HSA. * source formatting (clang-format v11) (#142) Co-authored-by: bwelton * Up timout --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: bwelton [ROCm/rocprofiler-sdk commit: de685246a7c061809791a3470ec7ae745a82cb22] --- .../samples/counter_collection/CMakeLists.txt | 2 +- .../samples/counter_collection/main.cpp | 2 +- .../source/lib/common/utility.hpp | 54 +++++++++++++++++++ .../source/lib/rocprofiler/hsa/queue.cpp | 19 +++++++ 4 files changed, 75 insertions(+), 2 deletions(-) diff --git a/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt b/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt index 3dd1a434e0..4e80703f07 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt @@ -47,7 +47,7 @@ set_tests_properties( counter-collection PROPERTIES TIMEOUT - 45 + 300 LABELS "samples" ENVIRONMENT diff --git a/projects/rocprofiler-sdk/samples/counter_collection/main.cpp b/projects/rocprofiler-sdk/samples/counter_collection/main.cpp index 1a456e92ce..596d77c28c 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/main.cpp +++ b/projects/rocprofiler-sdk/samples/counter_collection/main.cpp @@ -40,7 +40,7 @@ kernelC(T* C_d, const T* A_d, size_t N) void launchKernals() { - const int NUM_LAUNCH = 1000; + const int NUM_LAUNCH = 200000; // Normal HIP Calls int* gpuMem; [[maybe_unused]] hipDeviceProp_t devProp; diff --git a/projects/rocprofiler-sdk/source/lib/common/utility.hpp b/projects/rocprofiler-sdk/source/lib/common/utility.hpp index 20361086f5..5d28f50e83 100644 --- a/projects/rocprofiler-sdk/source/lib/common/utility.hpp +++ b/projects/rocprofiler-sdk/source/lib/common/utility.hpp @@ -25,7 +25,9 @@ #include #include #include +#include #include +#include #include #include @@ -97,5 +99,57 @@ private: L _destroy_func; }; +/** + * Limits the number of active items to those set in capacity. + * If capacity is reached, will block until another caller + * removes active capacity. + */ +class active_capacity_gate +{ +public: + active_capacity_gate(size_t capacity) + : _capacity(capacity) + {} + void add_active(size_t size) + { + if(size >= _capacity) + { + throw std::runtime_error("Size exceeds gate capacity"); + } + + std::unique_lock lock(_m); + if(_count + size < _capacity) + { + _count += size; + return; + } + _waiters++; + _cv.wait(lock, [&]() { return _count + size < _capacity; }); + _waiters--; + _count += size; + } + + void remove_active(size_t size) + { + std::unique_lock lock(_m); + if(_count > size) + _count -= size; + else + _count = 0; + + if(_waiters > 0) + { + _cv.notify_all(); + } + } + +private: + size_t _count{0}; + size_t _capacity{0}; + size_t _waiters{0}; + std::mutex _m; + std::condition_variable _cv; +}; + } // namespace common } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.cpp index afafd48a81..d2534353f3 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.cpp @@ -31,6 +31,15 @@ namespace hsa { namespace { +common::active_capacity_gate& +signal_limiter() +{ + // Limit the maximun number of HSA signals created. + // There is a hard limit to the maximum that can exist. + static common::active_capacity_gate _gate(1024); + return _gate; +} + bool AsyncSignalHandler(hsa_signal_value_t, void* data) { @@ -59,15 +68,24 @@ AsyncSignalHandler(hsa_signal_value_t, void* data) } }); + size_t signals_to_remove = 0; // Delete signals and packets, signal we have completed. if(queue_info_session.interrupt_signal.handle != 0u) + { + signals_to_remove++; queue_info_session.queue.core_api().hsa_signal_destroy_fn( queue_info_session.interrupt_signal); + } if(queue_info_session.kernel_pkt.completion_signal.handle != 0u) { + signals_to_remove++; queue_info_session.queue.core_api().hsa_signal_destroy_fn( queue_info_session.kernel_pkt.completion_signal); } + if(signals_to_remove > 0) + { + signal_limiter().remove_active(signals_to_remove); + } queue_info_session.queue.async_complete(); delete static_cast(data); @@ -267,6 +285,7 @@ Queue::signal_async_handler(const hsa_signal_t& signal, Queue::queue_info_sessio void Queue::create_signal(uint32_t attribute, hsa_signal_t* signal) const { + signal_limiter().add_active(1); hsa_status_t status = _ext_api.hsa_amd_signal_create_fn(1, 0, nullptr, attribute, signal); LOG_IF(FATAL, status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) << "Error: hsa_amd_signal_create failed";