Support sampling duration, sampling TIDs (#142)

- Sampling duration config values
  - OMNITRACE_SAMPLING_DURATION
  - OMNITRACE_PROCESS_SAMPLING_DURATION
  - Disables sampling after this time (in seconds) has elapsed 
- Sampling thread-id config values
  - OMNITRACE_SAMPLING_TIDS
  - OMNITRACE_SAMPLING_CPUTIME_TIDS
  - OMNITRACE_SAMPLING_REALTIME_TIDS
  - Allows user to select certain threads for sampling
- Miscellaneous
  - Tweaked the finalization verbosity messages
  - Moved sampling-on-child-threads handling into runtime.hpp and runtime.cpp
  - Fixed the header install destination for the dyninst submodule

[ROCm/rocprofiler-systems commit: e67afd33eb]
This commit is contained in:
Jonathan R. Madsen
2022-08-31 06:29:19 -05:00
committed by GitHub
parent cbdc7cad4b
commit 2ef9dfd002
17 changed files with 390 additions and 155 deletions
@@ -314,8 +314,7 @@ if(OMNITRACE_BUILD_DYNINST)
TARGETS ${_LIB}
DESTINATION ${CMAKE_INSTALL_LIBDIR}/omnitrace
COMPONENT dyninst
PUBLIC_HEADER DESTINATION ${PROJECT_BINARY_DIR}/.discard/omnitrace/include
)
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/omnitrace/dyninst)
endif()
endforeach()
@@ -97,7 +97,7 @@
#define OMNITRACE_STRINGIZE(X) OMNITRACE_STRINGIZE2(X)
#define OMNITRACE_STRINGIZE2(X) #X
#define OMNITRACE_VAR_NAME_COMBINE(X, Y) X##Y
#define OMNITRACE_VARIABLE(Y) OMNITRACE_VAR_NAME_COMBINE(_omni_var_, Y)
#define OMNITRACE_VARIABLE(X, Y) OMNITRACE_VAR_NAME_COMBINE(X, Y)
#define OMNITRACE_LINESTR OMNITRACE_STRINGIZE(__LINE__)
#define OMNITRACE_ESC(...) __VA_ARGS__
@@ -337,27 +337,26 @@ omnitrace_init_tooling_hidden()
if(get_state() > State::Active) return;
if(get_use_process_sampling())
{
pthread_gotcha::push_enable_sampling_on_child_threads(false);
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
process_sampler::setup();
pthread_gotcha::pop_enable_sampling_on_child_threads();
}
if(get_use_sampling())
{
pthread_gotcha::push_enable_sampling_on_child_threads(false);
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
sampling::setup();
pthread_gotcha::pop_enable_sampling_on_child_threads();
pthread_gotcha::push_enable_sampling_on_child_threads(get_use_sampling());
}
if(get_use_sampling())
{
push_enable_sampling_on_child_threads(get_use_sampling());
sampling::unblock_signals();
}
get_main_bundle()->start();
set_state(State::Active); // set to active as very last operation
} };
if(get_use_sampling())
{
pthread_gotcha::push_enable_sampling_on_child_threads(false);
sampling::block_signals();
}
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
if(get_use_sampling()) sampling::block_signals();
if(get_use_critical_trace())
{
@@ -426,8 +425,8 @@ omnitrace_init_tooling_hidden()
for(const auto& itr : _disabled_categories)
{
OMNITRACE_VERBOSE(1, "Disabling perfetto track event category: %s\n",
itr.c_str());
OMNITRACE_VERBOSE_F(1, "Disabling perfetto track event category: %s\n",
itr.c_str());
track_event_cfg.add_disabled_categories(itr);
}
@@ -581,6 +580,8 @@ omnitrace_finalize_hidden(void)
return;
}
if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n");
OMNITRACE_VERBOSE_F(0, "finalizing...\n");
thread_info::set_stop(comp::wall_clock::record());
@@ -604,8 +605,8 @@ omnitrace_finalize_hidden(void)
set_state(State::Finalized);
pthread_gotcha::push_enable_sampling_on_child_threads(false);
pthread_gotcha::set_sampling_on_all_future_threads(false);
push_enable_sampling_on_child_threads(false);
set_sampling_on_all_future_threads(false);
auto _debug_init = get_debug_finalize();
auto _debug_value = get_debug();
@@ -614,8 +615,6 @@ omnitrace_finalize_hidden(void)
if(_debug_init) config::set_setting_value("OMNITRACE_DEBUG", _debug_value);
} };
OMNITRACE_DEBUG_F("\n");
auto& _thread_bundle = thread_data<omnitrace_thread_bundle_t>::instance();
if(_thread_bundle) _thread_bundle->stop();
@@ -713,7 +712,7 @@ omnitrace_finalize_hidden(void)
comp::roctracer::shutdown();
// join extra thread(s) used by roctracer
OMNITRACE_VERBOSE_F(1, "Waiting on roctracer tasks...\n");
OMNITRACE_VERBOSE_F(2, "Waiting on roctracer tasks...\n");
tasking::join();
}
@@ -734,10 +733,11 @@ omnitrace_finalize_hidden(void)
// report the high-level metrics for the process
if(get_main_bundle())
{
if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n");
std::string _msg = JOIN("", *get_main_bundle());
auto _pos = _msg.find(">>> ");
if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5);
OMNITRACE_PRINT("%s\n", _msg.c_str());
OMNITRACE_VERBOSE_F(0, "%s\n", _msg.c_str());
OMNITRACE_DEBUG_F("Resetting main bundle...\n");
get_main_bundle()->reset();
}
@@ -754,10 +754,12 @@ omnitrace_finalize_hidden(void)
std::string _msg = JOIN("", *itr);
auto _pos = _msg.find(">>> ");
if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5);
OMNITRACE_VERBOSE(0, "%s\n", _msg.c_str());
OMNITRACE_VERBOSE_F(0, "%s\n", _msg.c_str());
}
}
if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n");
// ensure that all the MT instances are flushed
if(get_use_sampling())
{
@@ -813,6 +815,16 @@ omnitrace_finalize_hidden(void)
tasking::join();
}
// shutdown tasking before timemory is finalized, especially the roctracer thread-pool
OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n");
tasking::shutdown();
if(get_use_code_coverage())
{
OMNITRACE_VERBOSE_F(1, "Post-processing the code coverage...\n");
coverage::post_process();
}
bool _perfetto_output_error = false;
if(get_use_perfetto() && !is_system_backend())
{
@@ -821,11 +833,7 @@ omnitrace_finalize_hidden(void)
OMNITRACE_CI_THROW(tracing_session == nullptr,
"Null pointer to the tracing session");
if(get_verbose() >= 0) fprintf(stderr, "\n");
if(get_verbose() >= 0 || get_debug())
fprintf(stderr, "%s[%s][%s]|%i> Flushing perfetto...%s\n",
tim::log::color::info(), TIMEMORY_PROJECT_NAME, OMNITRACE_FUNCTION,
dmp::rank(), tim::log::color::end());
OMNITRACE_VERBOSE_F(0, "Finalizing perfetto...\n");
// Make sure the last event is closed for this example.
perfetto::TrackEvent::Flush();
@@ -905,16 +913,6 @@ omnitrace_finalize_hidden(void)
}
}
// shutdown tasking before timemory is finalized, especially the roctracer thread-pool
OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n");
tasking::shutdown();
OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n");
if(get_use_code_coverage())
{
coverage::post_process();
}
tim::manager::instance()->add_metadata([](auto& ar) {
auto _maps = tim::procfs::read_maps(process::get_id());
auto _libs = std::set<std::string>{};
@@ -22,7 +22,6 @@
#include "library/components/pthread_create_gotcha.hpp"
#include "library/components/category_region.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/roctracer.hpp"
#include "library/config.hpp"
#include "library/debug.hpp"
@@ -213,9 +212,8 @@ pthread_create_gotcha::wrapper::operator()() const
if(m_enable_sampling)
{
_is_sampling = true;
pthread_gotcha::push_enable_sampling_on_child_threads(false);
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
_signals = sampling::setup();
pthread_gotcha::pop_enable_sampling_on_child_threads();
sampling::unblock_signals();
}
}
@@ -336,7 +334,7 @@ pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
auto _active = (get_state() == ::omnitrace::State::Active && !_disabled);
auto _coverage = (get_mode() == Mode::Coverage);
auto _use_sampling = get_use_sampling();
auto _sample_child = pthread_gotcha::sampling_enabled_on_child_threads();
auto _sample_child = sampling_enabled_on_child_threads();
auto _tid = utility::get_thread_index();
auto _use_bundle = (_active && !_coverage);
const auto& _info = thread_info::init(!_active || !_sample_child || _disabled);
@@ -69,14 +69,6 @@ namespace
using bundle_t = tim::lightweight_tuple<component::pthread_create_gotcha_t,
component::pthread_mutex_gotcha_t>;
auto&
get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index())
{
static auto _v = utility::get_filled_array<OMNITRACE_MAX_THREADS>(
[]() { return utility::get_reserved_vector<bool>(32); });
return _v.at(_idx);
}
auto&
get_bundle()
{
@@ -112,51 +104,6 @@ pthread_gotcha::shutdown()
}
}
bool
pthread_gotcha::sampling_enabled_on_child_threads()
{
return sampling_on_child_threads();
}
bool
pthread_gotcha::push_enable_sampling_on_child_threads(bool _v)
{
bool _last = sampling_on_child_threads();
sampling_on_child_threads() = _v;
auto& _hist = get_sampling_on_child_threads_history();
_hist.emplace_back(_last);
return _last;
}
bool
pthread_gotcha::pop_enable_sampling_on_child_threads()
{
auto& _hist = get_sampling_on_child_threads_history();
if(!_hist.empty())
{
bool _restored = _hist.back();
_hist.pop_back();
sampling_on_child_threads() = _restored;
}
return sampling_on_child_threads();
}
void
pthread_gotcha::set_sampling_on_all_future_threads(bool _v)
{
for(size_t i = 0; i < max_supported_threads; ++i)
get_sampling_on_child_threads_history(i).emplace_back(_v);
}
bool&
pthread_gotcha::sampling_on_child_threads()
{
static thread_local bool _v = get_sampling_on_child_threads_history().empty()
? false
: get_sampling_on_child_threads_history().back();
return _v;
}
void
pthread_gotcha::start()
{
@@ -42,22 +42,7 @@ struct pthread_gotcha : tim::component::base<pthread_gotcha, void>
static void configure();
static void shutdown();
// query current value
static bool sampling_enabled_on_child_threads();
// use this to disable sampling in a region (e.g. right before thread creation)
static bool push_enable_sampling_on_child_threads(bool _v);
// use this to restore previous setting
static bool pop_enable_sampling_on_child_threads();
// make sure every newly created thead starts with this value
static void set_sampling_on_all_future_threads(bool _v);
static void start();
static void stop();
private:
static bool& sampling_on_child_threads();
};
} // namespace omnitrace
@@ -22,7 +22,6 @@
#include "library/components/pthread_mutex_gotcha.hpp"
#include "library/components/category_region.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/config.hpp"
#include "library/critical_trace.hpp"
#include "library/debug.hpp"
@@ -293,7 +292,7 @@ pthread_mutex_gotcha::is_disabled()
{
return (get_state() != ::omnitrace::State::Active ||
get_thread_state() != ThreadState::Enabled ||
(get_use_sampling() && !pthread_gotcha::sampling_enabled_on_child_threads()));
(get_use_sampling() && !sampling_enabled_on_child_threads()));
}
} // namespace component
} // namespace omnitrace
@@ -22,13 +22,13 @@
#include "library/components/roctracer.hpp"
#include "library/common.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/config.hpp"
#include "library/debug.hpp"
#include "library/defines.hpp"
#include "library/dynamic_library.hpp"
#include "library/redirect.hpp"
#include "library/roctracer.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
@@ -121,7 +121,7 @@ roctracer::setup()
roctracer_is_setup() = true;
OMNITRACE_VERBOSE_F(1, "setting up roctracer...\n");
pthread_gotcha::push_enable_sampling_on_child_threads(false);
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
dynamic_library _amdhip64{ "OMNITRACE_ROCTRACER_LIBAMDHIP64",
find_library_path("libamdhip64.so",
@@ -169,8 +169,6 @@ roctracer::setup()
for(auto& itr : roctracer_setup_routines())
itr.second();
pthread_gotcha::pop_enable_sampling_on_child_threads();
OMNITRACE_VERBOSE_F(1, "roctracer is setup\n");
}
@@ -92,6 +92,42 @@ get_available_perfetto_categories()
return _v;
}
// Parse a delimited list of integers and inclusive ranges (e.g. "0,1,5-8")
// into an ordered set.
//
//  _input_string : raw specification; entries are split on ',', ';', ' ' and '\t'
//  _label        : human-readable name of what is being parsed, only used in
//                  diagnostic messages
//
// Entries containing anything other than digits and '-' are reported and
// skipped. An entry containing '-' must split into exactly two fields (N-M),
// otherwise OMNITRACE_CONDITIONAL_THROW is invoked. Both range endpoints are
// included; a range whose first value exceeds its second contributes nothing.
template <typename Tp = int64_t>
std::set<Tp>
parse_numeric_range(std::string _input_string, const std::string& _label)
{
    // go through unsigned char: handing tolower a negative char value is undefined
    for(auto& itr : _input_string)
        itr = static_cast<char>(tolower(static_cast<unsigned char>(itr)));

    auto _result = std::set<Tp>{};
    for(const auto& _v : tim::delimit(_input_string, ",; \t"))
    {
        if(_v.find_first_not_of("0123456789-") != std::string::npos)
        {
            // terminate the message with a newline like every other verbose message
            OMNITRACE_VERBOSE_F(
                0,
                "Invalid %s specification. Only numerical values (e.g., 0) or "
                "ranges (e.g., 0-7) are permitted. Ignoring %s...\n",
                _label.c_str(), _v.c_str());
            continue;
        }

        if(_v.find('-') != std::string::npos)
        {
            auto _vv = tim::delimit(_v, "-");
            OMNITRACE_CONDITIONAL_THROW(
                _vv.size() != 2,
                "Invalid %s range specification: %s. Required format N-M, e.g. 0-4",
                _label.c_str(), _v.c_str());
            // parse both endpoints once instead of re-parsing the upper bound
            // on every iteration
            const int64_t _first = std::stol(_vv.at(0));
            const int64_t _last  = std::stol(_vv.at(1));
            for(int64_t i = _first; i <= _last; ++i)
                _result.emplace(i);
        }
        else
        {
            _result.emplace(std::stol(_v));
        }
    }
    return _result;
}
#define OMNITRACE_CONFIG_SETTING(TYPE, ENV_NAME, DESCRIPTION, INITIAL_VALUE, ...) \
[&]() { \
auto _ret = _config->insert<TYPE, TYPE>( \
@@ -334,12 +370,21 @@ configure_settings(bool _init)
"delivered. Defaults to OMNITRACE_SAMPLING_DELAY when <= 0.0",
-1.0, "sampling", "advanced");
OMNITRACE_CONFIG_SETTING(double, "OMNITRACE_SAMPLING_DURATION",
"If > 0.0, time (in seconds) to sample before stopping", 0.0,
"sampling", "process_sampling");
OMNITRACE_CONFIG_SETTING(
double, "OMNITRACE_PROCESS_SAMPLING_FREQ",
"Number of measurements per second when OMNITTRACE_USE_PROCESS_SAMPLING=ON. If "
"set to zero, uses OMNITRACE_SAMPLING_FREQ value",
0.0, "process_sampling");
OMNITRACE_CONFIG_SETTING(double, "OMNITRACE_PROCESS_SAMPLING_DURATION",
"If > 0.0, time (in seconds) to sample before stopping. If "
"less than zero, uses OMNITRACE_SAMPLING_DURATION",
-1.0, "sampling", "process_sampling");
OMNITRACE_CONFIG_SETTING(
std::string, "OMNITRACE_SAMPLING_CPUS",
"CPUs to collect frequency information for. Values should be separated by commas "
@@ -359,6 +404,29 @@ configure_settings(bool _init)
"'all' and 'none' suppresses all GPU sampling",
std::string{ "all" }, "rocm_smi", "rocm", "process_sampling");
OMNITRACE_CONFIG_SETTING(
std::string, "OMNITRACE_SAMPLING_TIDS",
"Limit call-stack sampling to specific thread IDs, starting at zero for the main "
"thread. Be aware that some libraries, such as ROCm may create additional "
"threads which increment the TID count. However, no threads started by omnitrace "
"will increment the TID count. Values should be separated by commas and can be "
"explicit or ranges, e.g. 0,1,5-8. An empty value implies all TIDs.",
std::string{}, "sampling", "advanced");
OMNITRACE_CONFIG_SETTING(
std::string, "OMNITRACE_SAMPLING_CPUTIME_TIDS",
"Same as OMNITRACE_SAMPLING_TIDS but applies specifically to samplers whose "
"timers are based on the CPU-time. This is useful when both "
"OMNITRACE_SAMPLING_CPUTIME=ON and OMNITRACE_SAMPLING_REALTIME=ON",
std::string{}, "sampling", "advanced");
OMNITRACE_CONFIG_SETTING(
std::string, "OMNITRACE_SAMPLING_REALTIME_TIDS",
"Same as OMNITRACE_SAMPLING_TIDS but applies specifically to samplers whose "
"timers are based on the real (wall) time. This is useful when both "
"OMNITRACE_SAMPLING_CPUTIME=ON and OMNITRACE_SAMPLING_REALTIME=ON",
std::string{}, "sampling", "advanced");
auto _backend = tim::get_env_choice<std::string>(
"OMNITRACE_PERFETTO_BACKEND",
(_system_backend) ? "system" // if OMNITRACE_PERFETTO_BACKEND_SYSTEM is true,
@@ -480,7 +548,7 @@ configure_settings(bool _init)
OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_PERFETTO_BUFFER_SIZE_KB",
"Size of perfetto buffer (in KB)", size_t{ 1024000 },
"perfetto", "data", "advanced");
"perfetto", "data");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_PERFETTO_COMBINE_TRACES",
"Combine Perfetto traces. If not explicitly set, it will "
@@ -695,9 +763,13 @@ configure_settings(bool _init)
tim::delimit(_config->get<std::string>("OMNITRACE_CONFIG_FILE"), ";:"))
{
if(_config->get_suppress_config()) continue;
OMNITRACE_BASIC_VERBOSE(1, "Reading config file %s\n", itr.c_str());
_config->read(itr);
if(_config->get<bool>("OMNITRACE_CI") && _main_proc)
if(_main_proc &&
((_config->get<bool>("OMNITRACE_CI") && settings::verbose() >= 0) ||
settings::verbose() >= 1 || settings::debug()))
{
std::ifstream _in{ itr };
std::stringstream _iss{};
@@ -709,7 +781,7 @@ configure_settings(bool _init)
}
if(!_iss.str().empty())
{
OMNITRACE_BASIC_PRINT("config file '%s':\n%s\n", itr.c_str(),
OMNITRACE_BASIC_PRINT("config file '%s':\n%s", itr.c_str(),
_iss.str().c_str());
}
}
@@ -1753,6 +1825,13 @@ get_sampling_real_delay()
return _val;
}
double
get_sampling_duration()
{
static auto _v = get_config()->find("OMNITRACE_SAMPLING_DURATION");
return static_cast<tim::tsettings<double>&>(*_v->second).get();
}
std::string
get_sampling_cpus()
{
@@ -1760,6 +1839,30 @@ get_sampling_cpus()
return static_cast<tim::tsettings<std::string>&>(*_v->second).get();
}
std::set<int64_t>
get_sampling_tids()
{
static auto _v = get_config()->find("OMNITRACE_SAMPLING_TIDS");
return parse_numeric_range<>(
static_cast<tim::tsettings<std::string>&>(*_v->second).get(), "thread IDs");
}
std::set<int64_t>
get_sampling_cpu_tids()
{
static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUTIME_TIDS");
return parse_numeric_range<>(
static_cast<tim::tsettings<std::string>&>(*_v->second).get(), "thread IDs");
}
std::set<int64_t>
get_sampling_real_tids()
{
static auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME_TIDS");
return parse_numeric_range<>(
static_cast<tim::tsettings<std::string>&>(*_v->second).get(), "thread IDs");
}
int64_t
get_critical_trace_count()
{
@@ -1777,6 +1880,13 @@ get_process_sampling_freq()
return _val;
}
double
get_process_sampling_duration()
{
static auto _v = get_config()->find("OMNITRACE_PROCESS_SAMPLING_DURATION");
return static_cast<tim::tsettings<double>&>(*_v->second).get();
}
std::string
get_sampling_gpus()
{
@@ -298,12 +298,24 @@ get_sampling_cpu_delay();
double
get_sampling_real_delay();
double
get_sampling_duration();
std::string
get_sampling_cpus();
std::set<int64_t>
get_sampling_cpu_tids();
std::set<int64_t>
get_sampling_real_tids();
double
get_process_sampling_freq();
double
get_process_sampling_duration();
std::string
get_sampling_gpus();
@@ -160,8 +160,9 @@ write_perfetto_counter_track(index&& _idx, Args... _args)
void
post_process()
{
OMNITRACE_PRINT("Post-processing %zu cpu frequency and memory usage entries...\n",
cpu_data.size());
OMNITRACE_VERBOSE(1,
"Post-processing %zu cpu frequency and memory usage entries...\n",
cpu_data.size());
auto _process_frequencies = [](size_t _idx, size_t _offset) {
using freq_track = perfetto_counter_track<cpu_freq_component>;
@@ -21,7 +21,6 @@
// SOFTWARE.
#include "library/process_sampler.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/config.hpp"
#include "library/cpu_freq.hpp"
#include "library/debug.hpp"
@@ -86,10 +85,16 @@ sampler::poll(std::atomic<State>* _state, nsec_t _interval, promise_t* _ready)
itr->config();
OMNITRACE_VERBOSE(
1, "Thread sampler polling at an interval of %f seconds...\n",
1, "Background process sampling polling at an interval of %f seconds...\n",
std::chrono::duration_cast<std::chrono::duration<double>>(_interval).count());
auto _duration = config::get_process_sampling_duration();
if(_duration < 0.0) _duration = config::get_sampling_duration();
bool _has_duration = (_duration > 0.0);
auto _now = std::chrono::steady_clock::now();
auto _end =
_now + std::chrono::nanoseconds{ static_cast<uint64_t>(_duration * units::sec) };
while(_state && _state->load() != State::Finalized && get_state() != State::Finalized)
{
std::this_thread::sleep_until(_now);
@@ -100,12 +105,23 @@ sampler::poll(std::atomic<State>* _state, nsec_t _interval, promise_t* _ready)
for(auto& itr : instances)
itr->sample();
get_sampler_is_sampling().store(false);
if(_has_duration && _now >= _end) break;
while(_now < std::chrono::steady_clock::now())
_now += _interval;
}
// ensure this is always false
get_sampler_is_sampling().store(false);
if(_has_duration && _now >= _end && get_state() != State::Finalized)
{
OMNITRACE_VERBOSE(
1,
"Background process sampling duration of %f seconds has elapsed. "
"Shutting down process sampling...\n",
_duration);
}
OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug(),
"Thread sampler polling completed...\n");
@@ -155,12 +171,12 @@ sampler::setup()
auto _fut = _prom.get_future();
polling_finished = std::make_unique<promise_t>();
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
set_state(State::PreInit);
pthread_gotcha::push_enable_sampling_on_child_threads(false);
get_thread() = std::make_unique<std::thread>(&poll<msec_t>, &get_sampler_state(),
msec_t{ _msec_freq }, &_prom);
_fut.wait();
pthread_gotcha::pop_enable_sampling_on_child_threads();
set_state(State::Active);
}
@@ -167,7 +167,7 @@ extern "C"
if(!tim::settings::enabled()) return true;
roctracer_is_init() = true;
pthread_gotcha::push_enable_sampling_on_child_threads(false);
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
OMNITRACE_BASIC_VERBOSE_F(1 || rocm::on_load_trace, "Loading ROCm tooling...\n");
tim::consume_parameters(table, runtime_version, failed_tool_count,
@@ -308,7 +308,6 @@ extern "C"
"failed! OMNITRACE_ROCPROFILER_LIBRARY=%s\n",
_rocprof.filename.c_str());
}
pthread_gotcha::pop_enable_sampling_on_child_threads();
OMNITRACE_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading... %s\n",
(_success) ? "Done" : "Failed");
@@ -33,12 +33,12 @@
#include "library/rocm_smi.hpp"
#include "library/common.hpp"
#include "library/components/fwd.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/config.hpp"
#include "library/critical_trace.hpp"
#include "library/debug.hpp"
#include "library/gpu.hpp"
#include "library/perfetto.hpp"
#include "library/runtime.hpp"
#include "library/state.hpp"
#include "library/thread_info.hpp"
@@ -328,7 +328,7 @@ setup()
if(is_initialized() || !get_use_rocm_smi()) return;
pthread_gotcha::push_enable_sampling_on_child_threads(false);
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
// assign the data value to determined by rocm-smi
data::device_count = device_count();
@@ -402,8 +402,6 @@ setup()
_e.what());
data::device_list = {};
}
pthread_gotcha::pop_enable_sampling_on_child_threads();
}
void
@@ -52,6 +52,26 @@
namespace omnitrace
{
namespace
{
// Per-thread stack of previous "sample child threads" values, selected by
// thread index (defaults to the calling thread's index). The backing storage
// is built once with OMNITRACE_MAX_THREADS entries, each created by
// get_reserved_vector<bool>(32).
auto&
get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index())
{
    static auto _histories = utility::get_filled_array<OMNITRACE_MAX_THREADS>(
        []() { return utility::get_reserved_vector<bool>(32); });
    return _histories.at(_idx);
}

// Thread-local flag holding the current "sample child threads" value.
// On first use in a thread it starts as the most recent entry of that thread's
// history, or false when the history is empty.
bool&
sampling_on_child_threads()
{
    static thread_local bool _enabled = []() {
        auto& _history = get_sampling_on_child_threads_history();
        if(_history.empty()) return false;
        return static_cast<bool>(_history.back());
    }();
    return _enabled;
}
}  // namespace
int
get_realtime_signal()
{
@@ -254,4 +274,40 @@ pop_thread_state()
}
return get_thread_state();
}
// Report the calling thread's current "sample child threads" value.
bool
sampling_enabled_on_child_threads()
{
    const bool _enabled = sampling_on_child_threads();
    return _enabled;
}
// Set the calling thread's "sample child threads" value to _v and remember the
// value it replaces on this thread's history so that
// pop_enable_sampling_on_child_threads() can restore it.
// Returns the value that was in effect before the call.
bool
push_enable_sampling_on_child_threads(bool _v)
{
    bool&      _current  = sampling_on_child_threads();
    const bool _previous = _current;
    _current             = _v;
    get_sampling_on_child_threads_history().emplace_back(_previous);
    return _previous;
}
// Restore the most recently saved "sample child threads" value for the calling
// thread, if any was saved. Returns the value in effect after the call; when
// the history is empty the current value is returned unchanged.
bool
pop_enable_sampling_on_child_threads()
{
    auto& _history = get_sampling_on_child_threads_history();
    if(_history.empty()) return sampling_on_child_threads();

    // take the saved value off the history before touching the flag
    const bool _previous = _history.back();
    _history.pop_back();
    sampling_on_child_threads() = _previous;
    return sampling_on_child_threads();
}
// Append _v to the history of every thread slot (0 .. max_supported_threads-1).
// A thread whose flag has not been initialized yet starts from the last entry
// of its history, so it will begin with this value.
void
set_sampling_on_all_future_threads(bool _v)
{
    size_t _slot = 0;
    while(_slot < max_supported_threads)
    {
        get_sampling_on_child_threads_history(_slot).emplace_back(_v);
        ++_slot;
    }
}
} // namespace omnitrace
@@ -117,11 +117,39 @@ struct scoped_thread_state
scoped_thread_state(ThreadState _v) { push_thread_state(_v); }
~scoped_thread_state() { pop_thread_state(); }
};
// query current value
bool
sampling_enabled_on_child_threads();
// use this to disable sampling in a region (e.g. right before thread creation)
bool
push_enable_sampling_on_child_threads(bool _v);
// use this to restore previous setting
bool
pop_enable_sampling_on_child_threads();
// make sure every newly created thread starts with this value
void
set_sampling_on_all_future_threads(bool _v);
// Scope guard for the "sample child threads" setting: the constructor pushes
// _v via push_enable_sampling_on_child_threads() and the destructor undoes it
// via pop_enable_sampling_on_child_threads().
struct scoped_child_sampling
{
    scoped_child_sampling(bool _v) { push_enable_sampling_on_child_threads(_v); }
    ~scoped_child_sampling() { pop_enable_sampling_on_child_threads(); }

    // every instance pops exactly once on destruction, so a copied or moved
    // guard would pop a value it never pushed
    scoped_child_sampling(const scoped_child_sampling&) = delete;
    scoped_child_sampling(scoped_child_sampling&&)      = delete;
    scoped_child_sampling& operator=(const scoped_child_sampling&) = delete;
    scoped_child_sampling& operator=(scoped_child_sampling&&) = delete;
};
} // namespace omnitrace
#define OMNITRACE_SCOPED_THREAD_STATE(STATE) \
::omnitrace::scoped_thread_state OMNITRACE_VARIABLE( \
OMNITRACE_VAR_NAME_COMBINE(scoped_thread_state_, __LINE__)) \
::omnitrace::scoped_thread_state OMNITRACE_VARIABLE(_scoped_thread_state_, __LINE__) \
{ \
::omnitrace::STATE \
}
#define OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(VALUE) \
::omnitrace::scoped_child_sampling OMNITRACE_VARIABLE(_scoped_child_sampling_, \
__LINE__) \
{ \
VALUE \
}
@@ -26,7 +26,6 @@
#include "library/components/backtrace_metrics.hpp"
#include "library/components/backtrace_timestamp.hpp"
#include "library/components/fwd.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/config.hpp"
#include "library/debug.hpp"
#include "library/ptl.hpp"
@@ -60,6 +59,8 @@
#include <timemory/variadic.hpp>
#include <array>
#include <chrono>
#include <condition_variable>
#include <cstring>
#include <ctime>
#include <initializer_list>
@@ -161,6 +162,79 @@ get_sampler_running(int64_t _tid)
return _v.at(_tid);
}
// Process-wide condition variable used to wake the sampling-duration thread.
auto&
get_duration_cv()
{
    static std::condition_variable _cv{};
    return _cv;
}
// Process-wide owner of the sampling-duration thread; null until one is started.
auto&
get_duration_thread()
{
    static std::unique_ptr<std::thread> _thread{};
    return _thread;
}
void
start_duration_thread()
{
static std::mutex _start_mutex{};
std::unique_lock<std::mutex> _start_lk{ _start_mutex, std::defer_lock };
if(!_start_lk.owns_lock()) _start_lk.lock();
if(!get_duration_thread() && config::get_sampling_duration() > 0.0)
{
// we may need to protect against recursion bc of pthread wrapper
static bool _protect = false;
if(_protect) return;
_protect = true;
auto _now = std::chrono::steady_clock::now();
auto _end = _now + std::chrono::nanoseconds{ static_cast<uint64_t>(
config::get_sampling_duration() * units::sec) };
auto _func = [_end]() {
thread_info::init(true);
std::mutex _mutex{};
bool _wait = true;
while(_wait)
{
_wait = false;
std::unique_lock<std::mutex> _lk{ _mutex };
get_duration_cv().wait_until(_lk, _end);
auto _premature = (std::chrono::steady_clock::now() < _end);
auto _finalized = (get_state() == State::Finalized);
if(_premature && !_finalized)
{
// protect against spurious wakeups
OMNITRACE_VERBOSE(
2, "%sSpurious wakeup of sampling duration thread...\n",
tim::log::color::warning());
_wait = true;
}
else if(_finalized)
{
break;
}
else
{
OMNITRACE_VERBOSE(1,
"Sampling duration of %f seconds has elapsed. "
"Shutting down sampling...\n",
config::get_sampling_duration());
shutdown();
}
}
};
OMNITRACE_VERBOSE(1, "Sampling will be disabled after %f seconds...\n",
config::get_sampling_duration());
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
get_duration_thread() = std::make_unique<std::thread>(_func);
_protect = false;
}
}
std::set<int>
configure(bool _setup, int64_t _tid = threading::get_id())
{
@@ -170,10 +244,24 @@ configure(bool _setup, int64_t _tid = threading::get_id())
bool _is_running = (!_running) ? false : *_running;
auto& _signal_types = sampling::get_signal_types(_tid);
pthread_gotcha::push_enable_sampling_on_child_threads(false);
auto _dtor = scope::destructor{ []() {
pthread_gotcha::pop_enable_sampling_on_child_threads();
} };
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
auto&& _cpu_tids = get_sampling_cpu_tids();
auto&& _real_tids = get_sampling_real_tids();
auto _erase_tid_signal = [_tid, &_signal_types](auto& _tids, int _signum) {
if(!_tids.empty())
{
if(_tids.count(_tid) == 0)
{
OMNITRACE_VERBOSE(3, "Disabling SIG%i from thread %li\n", _signum, _tid);
_signal_types->erase(_signum);
}
}
};
_erase_tid_signal(_cpu_tids, get_cputime_signal());
_erase_tid_signal(_real_tids, get_realtime_signal());
if(_setup && !_sampler && !_is_running && !_signal_types->empty())
{
@@ -253,6 +341,7 @@ configure(bool _setup, int64_t _tid = threading::get_id())
*_running = true;
sampling::get_sampler_init(_tid)->sample();
start_duration_thread();
_sampler->start();
}
else if(!_setup && _sampler && _is_running)
@@ -265,6 +354,7 @@ configure(bool _setup, int64_t _tid = threading::get_id())
sampling::block_signals(*_signal_types);
}
get_duration_cv().notify_one();
if(_tid == 0)
{
// this propagates to all threads
@@ -278,6 +368,12 @@ configure(bool _setup, int64_t _tid = threading::get_id())
*get_sampler_running(i) = false;
}
}
if(get_duration_thread())
{
get_duration_thread()->join();
get_duration_thread().reset();
}
}
_sampler->stop();
@@ -363,8 +459,8 @@ post_process()
for(size_t i = 0; i < max_supported_threads; ++i)
backtrace_metrics::configure(false, i);
OMNITRACE_VERBOSE(1 || get_debug_sampling(), "Post-processing sampling data...\n");
size_t _total_data = 0;
size_t _total_threads = 0;
for(size_t i = 0; i < max_supported_threads; ++i)
{
auto& _sampler = get_sampler(i);
@@ -398,7 +494,7 @@ post_process()
_sampler->stop();
auto& _raw_data = _sampler->get_data();
OMNITRACE_VERBOSE(0 || get_debug_sampling(),
OMNITRACE_VERBOSE(2 || get_debug_sampling(),
"Sampler data for thread %lu has %zu initial entries...\n", i,
_raw_data.size());
@@ -430,23 +526,27 @@ post_process()
continue;
}
OMNITRACE_VERBOSE(0 || get_debug_sampling(),
OMNITRACE_VERBOSE(2 || get_debug_sampling(),
"Sampler data for thread %lu has %zu valid entries...\n", i,
_raw_data.size());
_total_data += _raw_data.size();
_total_threads += 1;
if(get_use_perfetto()) post_process_perfetto(i, _init, _data);
if(get_use_timemory()) post_process_timemory(i, _init, _data);
}
OMNITRACE_VERBOSE(0 || get_debug_sampling(),
"Post-processing sampling entries completed\n");
OMNITRACE_VERBOSE(3 || get_debug_sampling(), "Destroying samplers...\n");
for(size_t i = 0; i < max_supported_threads; ++i)
{
get_sampler(i).reset();
}
OMNITRACE_VERBOSE(0 || get_debug_sampling(), "Post-processing samplers destroyed\n");
OMNITRACE_VERBOSE(1 || get_debug_sampling(),
"Collected %zu samples from %zu threads...\n", _total_data,
_total_threads);
}
namespace
@@ -535,17 +635,8 @@ post_process_perfetto(int64_t _tid, const bundle_t* _init,
}
};
if(_tid == 0 && config::get_mode() == Mode::Sampling &&
config::get_perfetto_fill_policy() == "discard")
{
_process_perfetto(_data);
}
else
{
pthread_gotcha::push_enable_sampling_on_child_threads(false);
std::thread{ _process_perfetto_wrapper }.join();
pthread_gotcha::pop_enable_sampling_on_child_threads();
}
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
std::thread{ _process_perfetto_wrapper }.join();
}
void