From a325f26c61940686e11d07a4d9367300ba9482c8 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Sun, 13 Nov 2022 14:37:07 -0600 Subject: [PATCH] Improve sampling allocator (#205) * Updated sampling - dynamic sampler is constructed with a shared pointer to an allocator instance - dynamic allocator handles multiple samplers - eliminates need for every per-thread dynamic sampler to start a background allocator thread * Fix usage of tim::popen [ROCm/rocprofiler-systems commit: 2135f82ab8e121f30e9b510a1dde0279bd3a819e] --- .../rocprofiler-systems/external/timemory | 2 +- .../source/bin/omnitrace/omnitrace.cpp | 18 ++--- .../source/lib/omnitrace/library/config.cpp | 40 +++++++++++ .../source/lib/omnitrace/library/config.hpp | 6 ++ .../source/lib/omnitrace/library/sampling.cpp | 71 ++++++++++++++++++- 5 files changed, 124 insertions(+), 13 deletions(-) diff --git a/projects/rocprofiler-systems/external/timemory b/projects/rocprofiler-systems/external/timemory index 040456175a..a1538e4352 160000 --- a/projects/rocprofiler-systems/external/timemory +++ b/projects/rocprofiler-systems/external/timemory @@ -1 +1 @@ -Subproject commit 040456175a81d50beb2ed55d62c2a39f7644776e +Subproject commit a1538e4352d99a8f1758d8d70ec107be11a101d7 diff --git a/projects/rocprofiler-systems/source/bin/omnitrace/omnitrace.cpp b/projects/rocprofiler-systems/source/bin/omnitrace/omnitrace.cpp index 354e084b9c..92704e07b2 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace/omnitrace.cpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace/omnitrace.cpp @@ -2197,18 +2197,18 @@ main(int argc, char** argv) verbprintf(0, "Consider instrumenting the relevant libraries...\n"); verbprintf(0, "\n"); - using TIMEMORY_PIPE = tim::popen::TIMEMORY_PIPE; + auto cmdv_envp = std::array{}; + cmdv_envp.fill(nullptr); + cmdv_envp.at(0) = strdup("LD_TRACE_LOADED_OBJECTS=1"); + auto ldd = tim::popen::popen(cmdv0.c_str(), nullptr, cmdv_envp.data()); + auto linked_libs = 
tim::popen::read_ldd_fork(ldd); + auto perr = tim::popen::pclose(ldd); + for(auto& itr : cmdv_envp) + ::free(itr); - tim::set_env("LD_TRACE_LOADED_OBJECTS", "1", 1); - TIMEMORY_PIPE* ldd = tim::popen::popen(cmdv0.c_str()); - tim::set_env("LD_TRACE_LOADED_OBJECTS", "0", 1); - - strvec_t linked_libraries = tim::popen::read_ldd_fork(ldd); - - auto perr = tim::popen::pclose(ldd); if(perr != 0) perror("Error in omnitrace_fork"); - for(const auto& itr : linked_libraries) + for(const auto& itr : linked_libs) verbprintf(0, "\t%s\n", itr.c_str()); verbprintf(0, "\n"); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp index 38f20c24b9..62038d721b 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp @@ -266,6 +266,20 @@ configure_settings(bool _init) "Verbosity within the omnitrace-dl library", 0, "debugging", "libomnitrace-dl", "advanced"); + OMNITRACE_CONFIG_SETTING( + size_t, "OMNITRACE_NUM_THREADS_HINT", + "This is hint for how many threads are expected to be created in the " + "application. Setting this value allows omnitrace to preallocate resources " + "during initialization and warn about any potential issues. For example, when " + "call-stack sampling, each thread has a unique sampler instance which " + "communicates with an allocator instance running in a background thread. Each " + "allocator only handles N sampling instances (where N is the value of " + "OMNITRACE_SAMPLING_ALLOCATOR_SIZE). 
When this hint is set to >= the number of " + "threads that get sampled, omnitrace can start all the background threads during " + "initialization", + get_env("OMNITRACE_NUM_THREADS", 1), "threading", "performance", + "sampling", "debugging", "advanced"); + OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_PERFETTO", "Enable perfetto backend", _default_perfetto_v, "backend", "perfetto"); @@ -484,6 +498,18 @@ configure_settings(bool _init) "Create entries for inlined functions when available", false, "sampling", "data", "advanced"); + OMNITRACE_CONFIG_SETTING( + size_t, "OMNITRACE_SAMPLING_ALLOCATOR_SIZE", + "The number of sampled threads handled by an allocator running in a background " + "thread. Each thread that is sampled communicates with an allocator running in a " + "background thread which handles storing/caching the data when it's buffer is " + "full. Setting this value too high (i.e. equal to the number of threads when the " + "thread count is high) may cause loss of data -- the sampler may fill a new " + "buffer and overwrite old buffer data before the allocator can process it. " + "Setting this value to 1 will result in a background allocator thread for every " + "thread started by the application.", + 8, "sampling", "debugging", "advanced"); + OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_SAMPLING_REALTIME", "Enable sampling frequency via a wall-clock timer on child threads. 
This may " @@ -1599,6 +1625,13 @@ get_use_rcclp() return static_cast&>(*_v->second).get(); } +size_t +get_num_threads_hint() +{ + static auto _v = get_config()->find("OMNITRACE_NUM_THREADS_HINT"); + return static_cast&>(*_v->second).get(); +} + bool get_critical_trace_debug() { @@ -1868,6 +1901,13 @@ get_sampling_include_inlines() return static_cast&>(*_v->second).get(); } +size_t +get_sampling_allocator_size() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_ALLOCATOR_SIZE"); + return std::max(static_cast&>(*_v->second).get(), 1); +} + int64_t get_critical_trace_count() { diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp index 9ea9f730d4..c6863097ff 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp @@ -302,6 +302,12 @@ get_sampling_real_tids(); bool get_sampling_include_inlines(); +size_t +get_num_threads_hint(); + +size_t +get_sampling_allocator_size(); + double get_process_sampling_freq(); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/sampling.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/sampling.cpp index da58f7e425..bbf580dda8 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/sampling.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/sampling.cpp @@ -30,6 +30,7 @@ #include "library/debug.hpp" #include "library/ptl.hpp" #include "library/runtime.hpp" +#include "library/state.hpp" #include "library/thread_data.hpp" #include "library/thread_info.hpp" #include "library/tracing.hpp" @@ -105,9 +106,70 @@ namespace sampling { namespace { +using sampler_allocator_t = typename sampler_t::allocator_t; + +auto& +get_sampler_allocators() +{ + static auto _v = std::vector>{}; + return _v; +} + std::set configure(bool _setup, int64_t _tid = threading::get_id()); +void 
+configure_sampler_allocator(std::shared_ptr& _v) +{ + if(_v) return; + + OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + + _v = std::make_shared(); + _v->reserve(config::get_sampling_allocator_size()); +} + +void +configure_sampler_allocators() +{ + auto& _allocators = get_sampler_allocators(); + if(_allocators.empty()) + { + // avoid lock until necessary + auto_lock_t _alloc_lk{ type_mutex() }; + if(_allocators.empty()) + { + _allocators.resize(std::ceil(config::get_num_threads_hint() / + config::get_sampling_allocator_size())); + for(auto& itr : _allocators) + configure_sampler_allocator(itr); + } + } +} + +std::shared_ptr +get_sampler_allocator() +{ + configure_sampler_allocators(); + + auto& _allocators = get_sampler_allocators(); + + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + + auto_lock_t _lk{ type_mutex() }; + + for(auto& itr : _allocators) + { + if(!itr) configure_sampler_allocator(itr); + if(itr->size() < config::get_sampling_allocator_size()) return itr; + } + + auto& _v = _allocators.emplace_back(); + configure_sampler_allocator(_v); + return _v; +} + template void thread_sigmask(Args... 
_args) @@ -403,9 +465,12 @@ configure(bool _setup, int64_t _tid) auto _verbose = std::min(get_verbose() - 2, 2); if(get_debug_sampling()) _verbose = 2; + OMNITRACE_DEBUG("Requesting allocator for sampler on thread %lu...\n", _tid); + auto _alloc = get_sampler_allocator(); + OMNITRACE_DEBUG("Configuring sampler for thread %lu...\n", _tid); - sampling::sampler_instances::construct(construct_on_thread{ _tid }, "omnitrace", - _tid, _verbose); + sampling::sampler_instances::construct(construct_on_thread{ _tid }, _alloc, + "omnitrace", _tid, _verbose); _sampler->set_flags(SA_RESTART); _sampler->set_verbose(_verbose); @@ -651,7 +716,7 @@ post_process() "Getting sampler data for thread %lu...\n", i); _sampler->stop(); - auto& _raw_data = _sampler->get_data(); + auto _raw_data = _sampler->get_data(); for(auto litr : _loaded_data[i]) { while(!litr.is_empty())