Improve sampling allocator (#205)

* Updated sampling

- dynamic sampler is constructed with a shared pointer to an allocator instance
- dynamic allocator handles multiple sampler
  - eliminates need for every per-thread dynamic sampler to start background allocator thread

* Fix usage of tim::popen

[ROCm/rocprofiler-systems commit: 2135f82ab8]
This commit is contained in:
Jonathan R. Madsen
2022-11-13 14:37:07 -06:00
committed by GitHub
orang tua 84c14233b9
melakukan a325f26c61
5 mengubah file dengan 124 tambahan dan 13 penghapusan
Submodule projects/rocprofiler-systems/external/timemory updated: 040456175a...a1538e4352
@@ -2197,18 +2197,18 @@ main(int argc, char** argv)
verbprintf(0, "Consider instrumenting the relevant libraries...\n");
verbprintf(0, "\n");
using TIMEMORY_PIPE = tim::popen::TIMEMORY_PIPE;
auto cmdv_envp = std::array<char*, 2>{};
cmdv_envp.fill(nullptr);
cmdv_envp.at(0) = strdup("LD_TRACE_LOADED_OBJECTS=1");
auto ldd = tim::popen::popen(cmdv0.c_str(), nullptr, cmdv_envp.data());
auto linked_libs = tim::popen::read_ldd_fork(ldd);
auto perr = tim::popen::pclose(ldd);
for(auto& itr : cmdv_envp)
::free(itr);
tim::set_env("LD_TRACE_LOADED_OBJECTS", "1", 1);
TIMEMORY_PIPE* ldd = tim::popen::popen(cmdv0.c_str());
tim::set_env("LD_TRACE_LOADED_OBJECTS", "0", 1);
strvec_t linked_libraries = tim::popen::read_ldd_fork(ldd);
auto perr = tim::popen::pclose(ldd);
if(perr != 0) perror("Error in omnitrace_fork");
for(const auto& itr : linked_libraries)
for(const auto& itr : linked_libs)
verbprintf(0, "\t%s\n", itr.c_str());
verbprintf(0, "\n");
@@ -266,6 +266,20 @@ configure_settings(bool _init)
"Verbosity within the omnitrace-dl library", 0,
"debugging", "libomnitrace-dl", "advanced");
OMNITRACE_CONFIG_SETTING(
size_t, "OMNITRACE_NUM_THREADS_HINT",
"This is hint for how many threads are expected to be created in the "
"application. Setting this value allows omnitrace to preallocate resources "
"during initialization and warn about any potential issues. For example, when "
"call-stack sampling, each thread has a unique sampler instance which "
"communicates with an allocator instance running in a background thread. Each "
"allocator only handles N sampling instances (where N is the value of "
"OMNITRACE_SAMPLING_ALLOCATOR_SIZE). When this hint is set to >= the number of "
"threads that get sampled, omnitrace can start all the background threads during "
"initialization",
get_env<size_t>("OMNITRACE_NUM_THREADS", 1), "threading", "performance",
"sampling", "debugging", "advanced");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_PERFETTO", "Enable perfetto backend",
_default_perfetto_v, "backend", "perfetto");
@@ -484,6 +498,18 @@ configure_settings(bool _init)
"Create entries for inlined functions when available", false,
"sampling", "data", "advanced");
OMNITRACE_CONFIG_SETTING(
size_t, "OMNITRACE_SAMPLING_ALLOCATOR_SIZE",
"The number of sampled threads handled by an allocator running in a background "
"thread. Each thread that is sampled communicates with an allocator running in a "
"background thread which handles storing/caching the data when it's buffer is "
"full. Setting this value too high (i.e. equal to the number of threads when the "
"thread count is high) may cause loss of data -- the sampler may fill a new "
"buffer and overwrite old buffer data before the allocator can process it. "
"Setting this value to 1 will result in a background allocator thread for every "
"thread started by the application.",
8, "sampling", "debugging", "advanced");
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_SAMPLING_REALTIME",
"Enable sampling frequency via a wall-clock timer on child threads. This may "
@@ -1599,6 +1625,13 @@ get_use_rcclp()
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
size_t
get_num_threads_hint()
{
static auto _v = get_config()->find("OMNITRACE_NUM_THREADS_HINT");
return static_cast<tim::tsettings<size_t>&>(*_v->second).get();
}
bool
get_critical_trace_debug()
{
@@ -1868,6 +1901,13 @@ get_sampling_include_inlines()
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
size_t
get_sampling_allocator_size()
{
static auto _v = get_config()->find("OMNITRACE_SAMPLING_ALLOCATOR_SIZE");
return std::max<size_t>(static_cast<tim::tsettings<size_t>&>(*_v->second).get(), 1);
}
int64_t
get_critical_trace_count()
{
@@ -302,6 +302,12 @@ get_sampling_real_tids();
bool
get_sampling_include_inlines();
size_t
get_num_threads_hint();
size_t
get_sampling_allocator_size();
double
get_process_sampling_freq();
@@ -30,6 +30,7 @@
#include "library/debug.hpp"
#include "library/ptl.hpp"
#include "library/runtime.hpp"
#include "library/state.hpp"
#include "library/thread_data.hpp"
#include "library/thread_info.hpp"
#include "library/tracing.hpp"
@@ -105,9 +106,70 @@ namespace sampling
{
namespace
{
using sampler_allocator_t = typename sampler_t::allocator_t;
auto&
get_sampler_allocators()
{
static auto _v = std::vector<std::shared_ptr<sampler_allocator_t>>{};
return _v;
}
std::set<int>
configure(bool _setup, int64_t _tid = threading::get_id());
void
configure_sampler_allocator(std::shared_ptr<sampler_allocator_t>& _v)
{
if(_v) return;
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
_v = std::make_shared<sampler_allocator_t>();
_v->reserve(config::get_sampling_allocator_size());
}
void
configure_sampler_allocators()
{
auto& _allocators = get_sampler_allocators();
if(_allocators.empty())
{
// avoid lock until necessary
auto_lock_t _alloc_lk{ type_mutex<decltype(_allocators)>() };
if(_allocators.empty())
{
_allocators.resize(std::ceil(config::get_num_threads_hint() /
config::get_sampling_allocator_size()));
for(auto& itr : _allocators)
configure_sampler_allocator(itr);
}
}
}
std::shared_ptr<sampler_allocator_t>
get_sampler_allocator()
{
configure_sampler_allocators();
auto& _allocators = get_sampler_allocators();
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
auto_lock_t _lk{ type_mutex<sampler_allocator_t>() };
for(auto& itr : _allocators)
{
if(!itr) configure_sampler_allocator(itr);
if(itr->size() < config::get_sampling_allocator_size()) return itr;
}
auto& _v = _allocators.emplace_back();
configure_sampler_allocator(_v);
return _v;
}
template <typename... Args>
void
thread_sigmask(Args... _args)
@@ -403,9 +465,12 @@ configure(bool _setup, int64_t _tid)
auto _verbose = std::min<int>(get_verbose() - 2, 2);
if(get_debug_sampling()) _verbose = 2;
OMNITRACE_DEBUG("Requesting allocator for sampler on thread %lu...\n", _tid);
auto _alloc = get_sampler_allocator();
OMNITRACE_DEBUG("Configuring sampler for thread %lu...\n", _tid);
sampling::sampler_instances::construct(construct_on_thread{ _tid }, "omnitrace",
_tid, _verbose);
sampling::sampler_instances::construct(construct_on_thread{ _tid }, _alloc,
"omnitrace", _tid, _verbose);
_sampler->set_flags(SA_RESTART);
_sampler->set_verbose(_verbose);
@@ -651,7 +716,7 @@ post_process()
"Getting sampler data for thread %lu...\n", i);
_sampler->stop();
auto& _raw_data = _sampler->get_data();
auto _raw_data = _sampler->get_data();
for(auto litr : _loaded_data[i])
{
while(!litr.is_empty())