Trace thread config + paranoid level + preload (#176)

- OMNITRACE_TRACE_THREAD_BARRIERS config option
  - set to OFF to disable wrapping `pthread_barrier`
- OMNITRACE_TRACE_THREAD_JOIN config option
  - set to OFF to disable wrapping `pthread_join`
- allow PAPI with perf_event_paranoid at level 2
- default to no PAPI events
- setenv LD_PRELOAD to not include libomnitrace after preload
  - closes #175 
- bump version to 1.7.1

[ROCm/rocprofiler-systems commit: a3439d5bf2]
This commit is contained in:
Jonathan R. Madsen
2022-10-06 19:11:08 -05:00
committato da GitHub
parent 45e5450bf2
commit 0ec0d18ac8
7 ha cambiato i file con 94 aggiunte e 21 eliminazioni
+1 -1
Vedi File
@@ -1 +1 @@
1.7.0
1.7.1
Submodule projects/rocprofiler-systems/external/timemory updated: 46f25fbb46...ddc49db2ec
@@ -96,13 +96,17 @@ variable to be enabled (i.e., `OMNITRACE_USE_ROCPROFILER=ON`).
Example configuration for hardware counters:
```console
# using papi identifiers
OMNITRACE_PAPI_EVENTS = PAPI_TOT_CYC PAPI_TOT_INS
# using perf identifiers
OMNITRACE_PAPI_EVENTS = perf::INSTRUCTIONS perf::CACHE-REFERENCES perf::CACHE-MISSES
```
#### OMNITRACE_PAPI_EVENTS
In order to collect the majority of hardware counters via PAPI, you need to make sure that `/proc/sys/kernel/perf_event_paranoid`
has a value <= 2. If you have sudo access, you can use the following command to modify the value:
```shell
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
@@ -98,9 +98,32 @@ get_omnitrace_dl_env()
/// Reports whether omnitrace is active via preloading.
///
/// The result is computed once, on the first call, and cached in a
/// function-local static: it is true only when the OMNITRACE_PRELOAD
/// environment setting (default: true) is enabled AND the LD_PRELOAD
/// environment variable contains "libomnitrace-dl.so". Later changes to
/// either environment variable do not affect the returned value.
inline bool
get_omnitrace_preload()
{
    static bool _v = []() {
        auto&& _preload      = get_env("OMNITRACE_PRELOAD", true);
        auto&& _preload_libs = get_env("LD_PRELOAD", std::string{});
        return (_preload &&
                _preload_libs.find("libomnitrace-dl.so") != std::string::npos);
    }();
    return _v;
}
/// Rewrites LD_PRELOAD so that it no longer lists any omnitrace library.
///
/// Does nothing unless the current LD_PRELOAD value contains
/// "libomnitrace-dl.so". Otherwise every ':'-separated entry whose text
/// contains "libomnitrace" is dropped, the remaining entries are re-joined
/// with ':' in their original order, and LD_PRELOAD is overwritten with the
/// result (which may be the empty string when nothing else was preloaded).
inline void
reset_omnitrace_preload()
{
    auto&& _preload_libs = get_env("LD_PRELOAD", std::string{});
    if(_preload_libs.find("libomnitrace-dl.so") != std::string::npos)
    {
        auto _modified_preload = std::string{};
        for(const auto& itr : delimit(_preload_libs, ":"))
        {
            // skip all omnitrace libraries, not only libomnitrace-dl.so
            if(itr.find("libomnitrace") != std::string::npos) continue;
            // each kept entry is appended as ":<entry>"
            _modified_preload += common::join("", ":", itr);
        }
        // the loop above always prefixes ':'; strip the leading separator
        if(!_modified_preload.empty() && _modified_preload.find(':') == 0)
            _modified_preload = _modified_preload.substr(1);

        // final argument of 1 => overwrite the existing LD_PRELOAD value
        setenv("LD_PRELOAD", _modified_preload.c_str(), 1);
    }
}
// environment priority:
@@ -940,6 +963,7 @@ omnitrace_preload()
if(_preload)
{
reset_omnitrace_preload();
omnitrace_preinit_library();
OMNITRACE_DL_LOG(1, "[%s] invoking %s(%s)\n", __FUNCTION__, "omnitrace_init",
::omnitrace::join(::omnitrace::QuoteStrings{}, ", ", "sampling",
@@ -71,6 +71,8 @@ pthread_mutex_gotcha::get_hashes()
for(size_t i = 9; i < 12; ++i)
_skip.emplace(i);
}
if(!config::get_trace_thread_barriers()) _skip.emplace(8);
if(!config::get_trace_thread_join()) _skip.emplace(12);
for(size_t i = 0; i < gotcha_capacity; ++i)
{
auto&& _id = _data.at(i).tool_id;
@@ -132,8 +134,12 @@ pthread_mutex_gotcha::configure()
"pthread_rwlock_unlock" });
}
pthread_mutex_gotcha_t::configure(
comp::gotcha_config<8, int, pthread_barrier_t*>{ "pthread_barrier_wait" });
if(config::get_trace_thread_barriers())
{
pthread_mutex_gotcha_t::configure(
comp::gotcha_config<8, int, pthread_barrier_t*>{
"pthread_barrier_wait" });
}
if(config::get_trace_thread_spin_locks())
{
@@ -149,8 +155,11 @@ pthread_mutex_gotcha::configure()
"pthread_spin_unlock" });
}
pthread_mutex_gotcha_t::configure(
comp::gotcha_config<12, int, pthread_t, void**>{ "pthread_join" });
if(config::get_trace_thread_join())
{
pthread_mutex_gotcha_t::configure(
comp::gotcha_config<12, int, pthread_t, void**>{ "pthread_join" });
}
};
}
@@ -459,6 +459,14 @@ configure_settings(bool _init)
"cause deadlocks with MPI distributions.",
false, "backend", "parallelism", "gotcha", "advanced");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_TRACE_THREAD_BARRIERS",
"Enable tracing calls to pthread_barrier functions.", true,
"backend", "parallelism", "gotcha", "advanced");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_TRACE_THREAD_JOIN",
"Enable tracing calls to pthread_join functions.", true,
"backend", "parallelism", "gotcha", "advanced");
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_SAMPLING_KEEP_INTERNAL",
"Configure whether the statistical samples should include call-stack entries "
@@ -601,7 +609,6 @@ configure_settings(bool _init)
_config->get_max_thread_bookmarks() = 1;
_config->get_timing_units() = "sec";
_config->get_memory_units() = "MB";
_config->get_papi_events() = "PAPI_TOT_CYC";
// settings native to timemory but critically and/or extensively used by omnitrace
auto _add_omnitrace_category = [&_config](auto itr) {
@@ -685,21 +692,22 @@ configure_settings(bool _init)
if(_fparanoid) _fparanoid >> _paranoid;
}
if(_paranoid > 1)
if(_paranoid > 2)
{
OMNITRACE_BASIC_VERBOSE(0,
"/proc/sys/kernel/perf_event_paranoid has a value of %i. "
"Disabling PAPI (requires a value <= 1)...\n",
"Disabling PAPI (requires a value <= 2)...\n",
_paranoid);
OMNITRACE_BASIC_VERBOSE(0,
"In order to enable PAPI support, run 'echo N | sudo tee "
"/proc/sys/kernel/perf_event_paranoid' where N is < 2\n");
tim::trait::runtime_enabled<comp::papi_common<void>>::set(false);
tim::trait::runtime_enabled<comp::papi_array_t>::set(false);
tim::trait::runtime_enabled<comp::papi_vector>::set(false);
tim::trait::runtime_enabled<comp::cpu_roofline_flops>::set(false);
tim::trait::runtime_enabled<comp::cpu_roofline_dp_flops>::set(false);
tim::trait::runtime_enabled<comp::cpu_roofline_sp_flops>::set(false);
OMNITRACE_BASIC_VERBOSE(
0, "In order to enable PAPI support, run 'echo N | sudo tee "
"/proc/sys/kernel/perf_event_paranoid' where N is <= 2\n");
trait::runtime_enabled<comp::papi_config>::set(false);
trait::runtime_enabled<comp::papi_common<void>>::set(false);
trait::runtime_enabled<comp::papi_array_t>::set(false);
trait::runtime_enabled<comp::papi_vector>::set(false);
trait::runtime_enabled<comp::cpu_roofline_flops>::set(false);
trait::runtime_enabled<comp::cpu_roofline_dp_flops>::set(false);
trait::runtime_enabled<comp::cpu_roofline_sp_flops>::set(false);
_config->get_papi_events() = std::string{};
}
else
@@ -812,6 +820,14 @@ configure_settings(bool _init)
tim::set_env(std::string{ _dl_verbose->first }, _dl_verbose->second->as_string(),
0);
if(_config->get_papi_events().empty())
{
trait::runtime_enabled<comp::papi_config>::set(false);
trait::runtime_enabled<comp::papi_common<void>>::set(false);
trait::runtime_enabled<comp::papi_array_t>::set(false);
trait::runtime_enabled<comp::papi_vector>::set(false);
}
configure_mode_settings();
configure_signal_handler();
configure_disabled_settings();
@@ -1886,6 +1902,20 @@ get_trace_thread_spin_locks()
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_trace_thread_barriers()
{
static auto _v = get_config()->find("OMNITRACE_TRACE_THREAD_BARRIERS");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_trace_thread_join()
{
static auto _v = get_config()->find("OMNITRACE_TRACE_THREAD_JOIN");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_debug_tid()
{
@@ -322,6 +322,12 @@ get_trace_thread_rwlocks();
bool
get_trace_thread_spin_locks();
bool
get_trace_thread_barriers();
bool
get_trace_thread_join();
std::string
get_rocm_events();
} // namespace config