Improve sampling allocator (#205)
* Updated sampling
- dynamic sampler is constructed with a shared pointer to an allocator instance
- dynamic allocator handles multiple samplers
- eliminates need for every per-thread dynamic sampler to start background allocator thread
* Fix usage of tim::popen
[ROCm/rocprofiler-systems commit: 2135f82ab8]
This commit is contained in:
committed by
GitHub
parent
84c14233b9
commit
a325f26c61
+1
-1
Submodule projects/rocprofiler-systems/external/timemory updated: 040456175a...a1538e4352
@@ -2197,18 +2197,18 @@ main(int argc, char** argv)
|
||||
verbprintf(0, "Consider instrumenting the relevant libraries...\n");
|
||||
verbprintf(0, "\n");
|
||||
|
||||
using TIMEMORY_PIPE = tim::popen::TIMEMORY_PIPE;
|
||||
auto cmdv_envp = std::array<char*, 2>{};
|
||||
cmdv_envp.fill(nullptr);
|
||||
cmdv_envp.at(0) = strdup("LD_TRACE_LOADED_OBJECTS=1");
|
||||
auto ldd = tim::popen::popen(cmdv0.c_str(), nullptr, cmdv_envp.data());
|
||||
auto linked_libs = tim::popen::read_ldd_fork(ldd);
|
||||
auto perr = tim::popen::pclose(ldd);
|
||||
for(auto& itr : cmdv_envp)
|
||||
::free(itr);
|
||||
|
||||
tim::set_env("LD_TRACE_LOADED_OBJECTS", "1", 1);
|
||||
TIMEMORY_PIPE* ldd = tim::popen::popen(cmdv0.c_str());
|
||||
tim::set_env("LD_TRACE_LOADED_OBJECTS", "0", 1);
|
||||
|
||||
strvec_t linked_libraries = tim::popen::read_ldd_fork(ldd);
|
||||
|
||||
auto perr = tim::popen::pclose(ldd);
|
||||
if(perr != 0) perror("Error in omnitrace_fork");
|
||||
|
||||
for(const auto& itr : linked_libraries)
|
||||
for(const auto& itr : linked_libs)
|
||||
verbprintf(0, "\t%s\n", itr.c_str());
|
||||
|
||||
verbprintf(0, "\n");
|
||||
|
||||
@@ -266,6 +266,20 @@ configure_settings(bool _init)
|
||||
"Verbosity within the omnitrace-dl library", 0,
|
||||
"debugging", "libomnitrace-dl", "advanced");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
size_t, "OMNITRACE_NUM_THREADS_HINT",
|
||||
"This is hint for how many threads are expected to be created in the "
|
||||
"application. Setting this value allows omnitrace to preallocate resources "
|
||||
"during initialization and warn about any potential issues. For example, when "
|
||||
"call-stack sampling, each thread has a unique sampler instance which "
|
||||
"communicates with an allocator instance running in a background thread. Each "
|
||||
"allocator only handles N sampling instances (where N is the value of "
|
||||
"OMNITRACE_SAMPLING_ALLOCATOR_SIZE). When this hint is set to >= the number of "
|
||||
"threads that get sampled, omnitrace can start all the background threads during "
|
||||
"initialization",
|
||||
get_env<size_t>("OMNITRACE_NUM_THREADS", 1), "threading", "performance",
|
||||
"sampling", "debugging", "advanced");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_PERFETTO", "Enable perfetto backend",
|
||||
_default_perfetto_v, "backend", "perfetto");
|
||||
|
||||
@@ -484,6 +498,18 @@ configure_settings(bool _init)
|
||||
"Create entries for inlined functions when available", false,
|
||||
"sampling", "data", "advanced");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
size_t, "OMNITRACE_SAMPLING_ALLOCATOR_SIZE",
|
||||
"The number of sampled threads handled by an allocator running in a background "
|
||||
"thread. Each thread that is sampled communicates with an allocator running in a "
|
||||
"background thread which handles storing/caching the data when it's buffer is "
|
||||
"full. Setting this value too high (i.e. equal to the number of threads when the "
|
||||
"thread count is high) may cause loss of data -- the sampler may fill a new "
|
||||
"buffer and overwrite old buffer data before the allocator can process it. "
|
||||
"Setting this value to 1 will result in a background allocator thread for every "
|
||||
"thread started by the application.",
|
||||
8, "sampling", "debugging", "advanced");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
bool, "OMNITRACE_SAMPLING_REALTIME",
|
||||
"Enable sampling frequency via a wall-clock timer on child threads. This may "
|
||||
@@ -1599,6 +1625,13 @@ get_use_rcclp()
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
size_t
|
||||
get_num_threads_hint()
|
||||
{
|
||||
static auto _v = get_config()->find("OMNITRACE_NUM_THREADS_HINT");
|
||||
return static_cast<tim::tsettings<size_t>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_critical_trace_debug()
|
||||
{
|
||||
@@ -1868,6 +1901,13 @@ get_sampling_include_inlines()
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
size_t
|
||||
get_sampling_allocator_size()
|
||||
{
|
||||
static auto _v = get_config()->find("OMNITRACE_SAMPLING_ALLOCATOR_SIZE");
|
||||
return std::max<size_t>(static_cast<tim::tsettings<size_t>&>(*_v->second).get(), 1);
|
||||
}
|
||||
|
||||
int64_t
|
||||
get_critical_trace_count()
|
||||
{
|
||||
|
||||
@@ -302,6 +302,12 @@ get_sampling_real_tids();
|
||||
bool
|
||||
get_sampling_include_inlines();
|
||||
|
||||
size_t
|
||||
get_num_threads_hint();
|
||||
|
||||
size_t
|
||||
get_sampling_allocator_size();
|
||||
|
||||
double
|
||||
get_process_sampling_freq();
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "library/debug.hpp"
|
||||
#include "library/ptl.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/state.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
#include "library/thread_info.hpp"
|
||||
#include "library/tracing.hpp"
|
||||
@@ -105,9 +106,70 @@ namespace sampling
|
||||
{
|
||||
namespace
|
||||
{
|
||||
using sampler_allocator_t = typename sampler_t::allocator_t;
|
||||
|
||||
auto&
|
||||
get_sampler_allocators()
|
||||
{
|
||||
static auto _v = std::vector<std::shared_ptr<sampler_allocator_t>>{};
|
||||
return _v;
|
||||
}
|
||||
|
||||
std::set<int>
|
||||
configure(bool _setup, int64_t _tid = threading::get_id());
|
||||
|
||||
void
|
||||
configure_sampler_allocator(std::shared_ptr<sampler_allocator_t>& _v)
|
||||
{
|
||||
if(_v) return;
|
||||
|
||||
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
_v = std::make_shared<sampler_allocator_t>();
|
||||
_v->reserve(config::get_sampling_allocator_size());
|
||||
}
|
||||
|
||||
void
|
||||
configure_sampler_allocators()
|
||||
{
|
||||
auto& _allocators = get_sampler_allocators();
|
||||
if(_allocators.empty())
|
||||
{
|
||||
// avoid lock until necessary
|
||||
auto_lock_t _alloc_lk{ type_mutex<decltype(_allocators)>() };
|
||||
if(_allocators.empty())
|
||||
{
|
||||
_allocators.resize(std::ceil(config::get_num_threads_hint() /
|
||||
config::get_sampling_allocator_size()));
|
||||
for(auto& itr : _allocators)
|
||||
configure_sampler_allocator(itr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<sampler_allocator_t>
|
||||
get_sampler_allocator()
|
||||
{
|
||||
configure_sampler_allocators();
|
||||
|
||||
auto& _allocators = get_sampler_allocators();
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
auto_lock_t _lk{ type_mutex<sampler_allocator_t>() };
|
||||
|
||||
for(auto& itr : _allocators)
|
||||
{
|
||||
if(!itr) configure_sampler_allocator(itr);
|
||||
if(itr->size() < config::get_sampling_allocator_size()) return itr;
|
||||
}
|
||||
|
||||
auto& _v = _allocators.emplace_back();
|
||||
configure_sampler_allocator(_v);
|
||||
return _v;
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
void
|
||||
thread_sigmask(Args... _args)
|
||||
@@ -403,9 +465,12 @@ configure(bool _setup, int64_t _tid)
|
||||
auto _verbose = std::min<int>(get_verbose() - 2, 2);
|
||||
if(get_debug_sampling()) _verbose = 2;
|
||||
|
||||
OMNITRACE_DEBUG("Requesting allocator for sampler on thread %lu...\n", _tid);
|
||||
auto _alloc = get_sampler_allocator();
|
||||
|
||||
OMNITRACE_DEBUG("Configuring sampler for thread %lu...\n", _tid);
|
||||
sampling::sampler_instances::construct(construct_on_thread{ _tid }, "omnitrace",
|
||||
_tid, _verbose);
|
||||
sampling::sampler_instances::construct(construct_on_thread{ _tid }, _alloc,
|
||||
"omnitrace", _tid, _verbose);
|
||||
|
||||
_sampler->set_flags(SA_RESTART);
|
||||
_sampler->set_verbose(_verbose);
|
||||
@@ -651,7 +716,7 @@ post_process()
|
||||
"Getting sampler data for thread %lu...\n", i);
|
||||
|
||||
_sampler->stop();
|
||||
auto& _raw_data = _sampler->get_data();
|
||||
auto _raw_data = _sampler->get_data();
|
||||
for(auto litr : _loaded_data[i])
|
||||
{
|
||||
while(!litr.is_empty())
|
||||
|
||||
Reference in New Issue
Block a user