ファイル
rocm-systems/source/lib/rocprof-sys/library.cpp
T
Sohaib Nadeem 0e535daa93 Initialization fixes (#154)
- Remove tooling initialization from rocprofiler_configure:
when rocprofiler configure is called from __hip_module_ctor
(which in turn is called as a global constructor when loading shared
libraries or before main in a hip program), initializing tooling
in it can cause problems because it is too early to do some of the tasks
that it involves (e.g. opening shared libraries, creating threads).
Instead, we rely on rocprofsys_main to initialize tooling later.

- Skip rocprofiler_configure if ROCPROFSYS_PRELOAD is not set since
preload is required for tooling (such as perfetto, which is used by
the rocprofiler callbacks) to be initialized.

- Revert RCCL initialization changes: These are no longer needed since rocprofsys_init_tooling_hidden will not
be called from rocprofiler_configure

- Force rocprofiler_configure in rocprofsys_init_tooling_hidden if it hasn't been
called through __hip_module_ctor global constructor
2025-04-21 17:04:24 -04:00

1020 行
34 KiB
C++

// MIT License
//
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <timemory/log/color.hpp>
//
// above should always be included first
//
#include "api.hpp"
#include "common/setup.hpp"
#include "common/static_object.hpp"
#include "core/categories.hpp"
#include "core/components/fwd.hpp"
#include "core/concepts.hpp"
#include "core/config.hpp"
#include "core/constraint.hpp"
#include "core/debug.hpp"
#include "core/defines.hpp"
#include "core/dynamic_library.hpp"
#include "core/gpu.hpp"
#include "core/locking.hpp"
#include "core/perfetto_fwd.hpp"
#include "core/timemory.hpp"
#include "core/utility.hpp"
#include "library/causal/data.hpp"
#include "library/causal/experiment.hpp"
#include "library/causal/sampling.hpp"
#include "library/components/exit_gotcha.hpp"
#include "library/components/fork_gotcha.hpp"
#include "library/components/mpi_gotcha.hpp"
#include "library/components/numa_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/vaapi_gotcha.hpp"
#include "library/coverage.hpp"
#include "library/ompt.hpp"
#include "library/process_sampler.hpp"
#include "library/ptl.hpp"
#include "library/rcclp.hpp"
#include "library/rocprofiler-sdk.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
#include "library/thread_info.hpp"
#include "library/tracing.hpp"
#include "rocprofiler-systems/categories.h" // in rocprof-sys-user
#include <timemory/hash/types.hpp>
#include <timemory/log/logger.hpp>
#include <timemory/manager/manager.hpp>
#include <timemory/mpl/type_traits.hpp>
#include <timemory/operations/types/file_output_message.hpp>
#include <timemory/process/process.hpp>
#include <timemory/process/threading.hpp>
#include <timemory/settings/types.hpp>
#include <timemory/signals/signal_handlers.hpp>
#include <timemory/signals/signal_mask.hpp>
#include <timemory/signals/types.hpp>
#include <timemory/units.hpp>
#include <timemory/utility/backtrace.hpp>
#include <timemory/utility/join.hpp>
#include <timemory/utility/procfs/maps.hpp>
#include <atomic>
#include <chrono>
#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <pthread.h>
#include <stdexcept>
#include <string_view>
#include <utility>
using namespace rocprofsys;
//======================================================================================//
namespace rocprofsys
{
namespace timeout
{
// Forward declaration only: the definition is not part of this file. It is invoked
// from ensure_finalization() in this file.
void
setup() ROCPROFSYS_INTERNAL_API;
} // namespace timeout
} // namespace rocprofsys
namespace
{
// Handles to the timemory manager and settings instances, obtained while the
// namespace-scope objects of this translation unit are being initialized.
// _timemory_manager is used later in this file (set_write_metadata, perfetto
// post-processing, timemory finalization and metadata output).
auto _timemory_manager = tim::manager::instance();
auto _timemory_settings = tim::settings::shared_instance();
/// Threading callback (registered via threading::add_callback in ensure_finalization).
///
/// Unless the process is known to be exiting with a non-success code, this grows the
/// per-thread data so that it covers the calling thread and, for thread indexes in
/// the open range (0, grown size), emits a verbosity-3 diagnostic message.
///
/// \param _offset    passed through unchanged as the return value
/// \param _glob_n    counter value reported in the diagnostic message
/// \param _offset_n  counter value reported in the diagnostic message
/// \return the value of \p _offset
bool
ensure_initialization(bool _offset, int64_t _glob_n, int64_t _offset_n)
{
    auto       _status      = component::exit_gotcha::get_exit_info();
    const bool _failed_exit = _status.is_known && _status.exit_code != EXIT_SUCCESS;

    if(!_failed_exit)
    {
        auto _thread_idx   = utility::get_thread_index();
        auto _thread_limit = grow_data(_thread_idx + 1);

        const bool _report = (_thread_idx > 0) && (_thread_idx < _thread_limit);
        if(_report)
        {
            const auto& _info = thread_info::get();
            ROCPROFSYS_BASIC_VERBOSE_F(3,
                                       "thread info: %s, offset: %s, global counter: %li, "
                                       "offset counter: %li, max threads: %li\n",
                                       std::to_string(static_cast<bool>(_info)).c_str(),
                                       std::to_string(_offset).c_str(), _glob_n,
                                       _offset_n, _thread_limit);
        }
    }

    return _offset;
}
/// Handler installed through config::set_signal_handler (see ensure_finalization):
/// runs rocprofsys_finalize() only while the library state is State::Active.
void
finalization_handler()
{
    const bool _is_active = (get_state() == State::Active);
    if(!_is_active) return;
    rocprofsys_finalize();
}
// Performs process-level setup (signal handler, thread info, timeout, timemory
// instances, shared-pointer callback, optional environment setup) and returns a
// scope::destructor whose callable invokes rocprofsys_finalize_hidden().
//
// _static_init: when true, additionally registers ensure_initialization as a
// threading callback and runs common::setup_environ with search paths assembled
// from the environment. The namespace-scope initializer at the end of this file
// passes true; rocprofsys_init_tooling_hidden uses the default (false).
//
// Throws exception<std::runtime_error> when the threading callback cannot be added.
auto
ensure_finalization(bool _static_init = false)
{
// install finalization_handler only when no handler is registered yet
// NOTE(review): set_signal_handler(nullptr) is used as a query here; presumably a
// null argument returns the current handler without replacing it -- confirm
if(config::set_signal_handler(nullptr) == nullptr)
config::set_signal_handler(&finalization_handler);
if(_static_init)
{
// a negative index is treated as a registration failure
auto _idx = threading::add_callback(&ensure_initialization);
if(_idx < 0)
throw exception<std::runtime_error>("failure adding threading callback");
}
// CI-only check that the registered handler is finalization_handler
ROCPROFSYS_CI_BASIC_THROW(
config::set_signal_handler(nullptr) != &finalization_handler,
"Assignment of signal handler failed. signal handler is %s, expected %s\n",
as_hex(reinterpret_cast<void*>(config::set_signal_handler(nullptr))).c_str(),
as_hex(reinterpret_cast<void*>(&finalization_handler)).c_str());
// initialize the thread info of the calling thread and, when index data is
// available, cross-check it (CI only) against the ids reported by threading
const auto& _info = thread_info::init();
const auto& _tid = _info->index_data;
if(_tid)
{
ROCPROFSYS_CI_THROW(_tid->sequent_value != threading::get_id(),
"Error! internal tid != %li :: %li", threading::get_id(),
_tid->sequent_value);
ROCPROFSYS_CI_THROW(_tid->system_value != threading::get_sys_tid(),
"Error! system tid != %li :: %li", threading::get_sys_tid(),
_tid->system_value);
}
// ROCPROFSYS_MONOCHROME switches the timemory log output to monochrome
if(common::get_env("ROCPROFSYS_MONOCHROME", false)) tim::log::monochrome() = true;
timeout::setup();
// return values intentionally discarded
(void) tim::manager::instance();
(void) tim::settings::shared_instance();
// install (once) a callback that runs hidden finalization when it is invoked
// with a value of zero
// NOTE(review): the callback object is allocated with new and not released in
// this file -- confirm ownership is taken by timemory
if(!tim::get_shared_ptr_pair_callback())
{
tim::get_shared_ptr_pair_callback() =
new tim::shared_ptr_pair_callback_t{ [](int64_t _n) {
if(_n == 0) rocprofsys_finalize_hidden();
} };
}
if(_static_init)
{
ROCPROFSYS_BASIC_DEBUG_F("\n");
// debug modes add 16 to the verbosity taken from the environment
auto _verbose =
get_verbose_env() + ((get_debug_env() || get_debug_init()) ? 16 : 0);
// colon-joined list of search locations handed to common::setup_environ
auto _search_paths = JOIN(':', tim::get_env<std::string>("ROCPROFSYS_PATH", ""),
tim::get_env<std::string>("PWD"), ".",
tim::get_env<std::string>("LD_LIBRARY_PATH", ""),
tim::get_env<std::string>("LIBRARY_PATH", ""),
tim::get_env<std::string>("PATH", ""));
common::setup_environ(_verbose, _search_paths);
}
else
{
ROCPROFSYS_DEBUG_F("\n");
}
if(_timemory_manager) _timemory_manager->set_write_metadata(-1);
// the returned object carries the hidden-finalization call
return scope::destructor{ []() { rocprofsys_finalize_hidden(); } };
}
/// Tuple of components that is started/stopped around finalization and can be
/// rendered as a single string (see its use in rocprofsys_finalize_hidden).
template <typename... CompT>
struct fini_bundle
{
    using data_type = std::tuple<CompT...>;

    ROCPROFSYS_DEFAULT_OBJECT(fini_bundle)

    /// \param _label  prefix used by as_string(); only a view is stored, so the
    ///                referenced characters must outlive this object
    fini_bundle(std::string_view _label)
    : m_label{ _label }
    {}

    /// Applies tim::operation::start to every component, forwarding \p _args.
    template <typename... ArgsT>
    void start(ArgsT&&... _args)
    {
        TIMEMORY_FOLD_EXPRESSION(tim::operation::start<CompT>{}(
            std::get<CompT>(m_data), std::forward<ArgsT>(_args)...));
    }

    /// Applies tim::operation::stop to every component, forwarding \p _args.
    template <typename... ArgsT>
    void stop(ArgsT&&... _args)
    {
        TIMEMORY_FOLD_EXPRESSION(tim::operation::stop<CompT>{}(
            std::get<CompT>(m_data), std::forward<ArgsT>(_args)...));
    }

    /// Joins the components with ", ". When \p _print_prefix is true and the label
    /// is non-empty, the result is prefixed with "<label> : ".
    std::string as_string(bool _print_prefix = true) const
    {
        std::stringstream _out;
        const bool        _with_label = _print_prefix && !m_label.empty();
        if(_with_label) _out << m_label << " : ";
        _out << timemory::join::join(", ", std::get<CompT>(m_data)...);
        return _out.str();
    }

    std::string_view m_label = {};
    data_type        m_data  = {};
};

/// Unpacks the component types of a tim::lightweight_tuple: base_type names the
/// fini_bundle holding those same types.
template <typename... CompT>
struct fini_bundle<tim::lightweight_tuple<CompT...>>
{
    using base_type = fini_bundle<CompT...>;
};

/// fini_bundle over the component types of main_bundle_t.
using fini_bundle_t = typename fini_bundle<main_bundle_t>::base_type;
} // namespace
//======================================================================================//
///
///
///
//======================================================================================//
namespace
{
// Empty tag type. Its only use in this file is as the template argument of
// tim::type_mutex<set_env_s>(), the mutex locked by rocprofsys_set_env_hidden.
struct set_env_s // NOLINT
{};
} // namespace
/// Sets the environment variable \p env_name to \p env_val without overriding an
/// existing value (override flag 0), serialized by a mutex.
///
/// The first time a given name is seen while the library state is already
/// State::Init or later, a warning is emitted because the variable is set too late
/// to influence the configuration.
///
/// \param env_name  name of the variable; a null pointer makes the call a no-op
/// \param env_val   value of the variable; a null pointer makes the call a no-op
extern "C" void
rocprofsys_set_env_hidden(const char* env_name, const char* env_val)
{
    // null pointers cannot be converted to strings below (undefined behavior), so
    // such a request is ignored
    if(env_name == nullptr || env_val == nullptr) return;

    tim::auto_lock_t _lk{ tim::type_mutex<set_env_s>() };

    // The names are stored as owning strings. The caller's buffer is not guaranteed
    // to outlive this call, so keeping a std::string_view in this static set could
    // leave dangling views that later insertions would compare against.
    static auto _set_envs = std::set<std::string>{};
    bool        _success  = _set_envs.emplace(env_name).second;

    // just search env to avoid initializing the settings
    ROCPROFSYS_CONDITIONAL_PRINT_F(get_debug_init() || get_verbose_env() > 2,
                                   "Setting env: %s=%s\n", env_name, env_val);

    tim::set_env(env_name, env_val, 0);

    // warn once per variable name
    if(_success && get_state() >= State::Init)
    {
        ROCPROFSYS_WARNING_F(
            0,
            "rocprofsys_set_env(\"%s\", \"%s\") called after rocprof-sys was "
            "initialized. "
            "state = %s. This environment variable will have no effect\n",
            env_name, env_val, std::to_string(get_state()).c_str());
    }
}
//======================================================================================//
///
///
///
//======================================================================================//
namespace
{
// set to true by rocprofsys_set_mpi_hidden; read by rocprofsys_init_hidden to decide
// whether to run the pre-init step
bool _set_mpi_called = false;
// action executed by rocprofsys_preinit_hidden: initially starts the pre-init bundle
std::function<void()> _preinit_callback = []() { get_preinit_bundle()->start(); };
// Runs the pre-init callback and then replaces it with a no-op, so that only the
// first invocation starts the pre-init bundle.
void
rocprofsys_preinit_hidden()
{
// run once and discard
_preinit_callback();
_preinit_callback = []() {};
}
} // namespace
/// Records whether MPI is in use and whether the tool was attached to the process.
///
/// \param use       whether MPI is used; when false the MPI gotcha is disabled
/// \param attached  stored into config::is_attached()
///
/// A repeat call carrying the same arguments as the very first call returns
/// immediately. Otherwise the pre-init step is run at the end of the call.
extern "C" void
rocprofsys_set_mpi_hidden(bool use, bool attached)
{
    static bool _called_before = false;
    // arguments of the first invocation (function-local statics are initialized once)
    static auto _first_args = std::make_pair(use, attached);

    // this function may be called multiple times if multiple libraries are
    // instrumented: repeat calls with the arguments of the first call are ignored
    const bool _same_as_first =
        (_first_args.first == use) && (_first_args.second == attached);
    if(_called_before && _same_as_first) return;
    _called_before = true;

    // just search env to avoid initializing the settings
    ROCPROFSYS_CONDITIONAL_PRINT_F(get_debug_init() || get_verbose_env() > 2,
                                   "use: %s, attached: %s\n", (use) ? "y" : "n",
                                   (attached) ? "y" : "n");

    _set_mpi_called       = true;
    config::is_attached() = attached;

    if(!use)
    {
        trait::runtime_enabled<mpi_gotcha_t>::set(false);
    }
    else if(!attached && get_state() == State::PreInit)
    {
        // MPI in a launched (not attached) process that is not yet initialized
        tim::set_env("ROCPROFSYS_USE_PID", "ON", 1);
    }

    const bool _too_late = (get_state() >= State::Init);
    if(_too_late)
    {
        ROCPROFSYS_WARNING_F(
            0,
            "rocprofsys_set_mpi(use=%s, attached=%s) called after rocprof-sys was "
            "initialized. state = %s. MPI support may not be properly initialized. Use "
            "ROCPROFSYS_USE_MPIP=ON and ROCPROFSYS_USE_PID=ON to ensure full support\n",
            std::to_string(use).c_str(), std::to_string(attached).c_str(),
            std::to_string(get_state()).c_str());
    }

    rocprofsys_preinit_hidden();
}
//======================================================================================//
extern "C" void
rocprofsys_init_library_hidden()
{
auto _tid = threading::get_id();
(void) _tid;
static bool _once = false;
auto _debug_init = get_debug_init();
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s...\n",
std::to_string(get_state()).c_str());
ROCPROFSYS_CI_THROW(get_state() != State::PreInit, "State is not PreInit :: %s",
std::to_string(get_state()).c_str());
if(get_state() != State::PreInit || get_state() == State::Init || _once) return;
_once = true;
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s. Setting to %s...\n",
std::to_string(get_state()).c_str(),
std::to_string(State::Init).c_str());
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(
_debug_init, "Calling backtrace once so that the one-time call of malloc in "
"glibc's backtrace() occurs...\n");
{
std::stringstream _ss{};
timemory_print_backtrace<16>(_ss);
(void) _ss;
}
set_state(State::Init);
ROCPROFSYS_CI_THROW(get_state() != State::Init,
"set_state(State::Init) failed. state is %s",
std::to_string(get_state()).c_str());
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "Configuring settings...\n");
// configure the settings
configure_settings();
auto _debug_value = get_debug();
if(_debug_init) config::set_setting_value("ROCPROFSYS_DEBUG", true);
scope::destructor _debug_dtor{ [_debug_value, _debug_init]() {
if(_debug_init) config::set_setting_value("ROCPROFSYS_DEBUG", _debug_value);
} };
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n");
}
//======================================================================================//
// Initializes the tooling layer: library/settings initialization, gotcha bundles,
// sampling, perfetto, tasking, causal profiling, timemory components, OMPT, ROCm,
// RCCLP and categories, in that order.
//
// Returns true when this call performed the initialization. Returns false when
// ROCPROFSYS_INIT_TOOLING is disabled (only the library is initialized then), when
// the state is not State::PreInit, or when the function already ran.
extern "C" bool
rocprofsys_init_tooling_hidden()
{
if(get_env("ROCPROFSYS_MONOCHROME", false, false)) tim::log::monochrome() = true;
// tooling can be disabled through the environment; the library part is still
// initialized in that case
if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true))
{
rocprofsys_init_library_hidden();
return false;
}
#if ROCPROFSYS_USE_ROCM > 0
// libamdhip64.so is looked up via ROCPROFSYS_ROCM_PATH / ROCM_PATH and the default
// ROCm path
// NOTE(review): presumably the dynamic_library object keeps the library opened for
// the duration of this function -- confirm in core/dynamic_library.hpp
dynamic_library _amdhip64{ "ROCPROFSYS_ROCTRACER_LIBAMDHIP64",
find_library_path("libamdhip64.so",
{ "ROCPROFSYS_ROCM_PATH", "ROCM_PATH" },
{ ROCPROFSYS_DEFAULT_ROCM_PATH }) };
#endif
static bool _once = false;
static auto _debug_init = get_debug_init();
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s...\n",
std::to_string(get_state()).c_str());
// only the first call made while in State::PreInit proceeds
if(get_state() != State::PreInit || get_state() == State::Init || _once) return false;
_once = true;
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
// NOTE(review): the guard above already returned unless the state is PreInit, so
// this condition looks unreachable at this point -- confirm the intended behavior
ROCPROFSYS_CONDITIONAL_THROW(
get_state() == State::Init,
"%s called after rocprofsys_init_library() was explicitly called",
ROCPROFSYS_FUNCTION);
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(get_verbose_env() >= 0,
"Instrumentation mode: %s\n",
std::to_string(config::get_mode()).c_str());
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "Printing banner...\n");
if(get_verbose_env() >= 0) print_banner();
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init,
"Calling rocprofsys_init_library()...\n");
rocprofsys_init_library_hidden();
ROCPROFSYS_DEBUG_F("\n");
// deferred work: sampling setup, starting the main bundle and the switch to
// State::Active are wrapped in a scope::destructor so they happen when this object
// is destroyed (assuming scope::destructor invokes its callable on destruction)
auto _dtor = scope::destructor{ []() {
// if set to finalized, don't continue
if(get_state() > State::Active) return;
if(get_use_process_sampling())
{
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
process_sampler::setup();
}
// causal sampling takes precedence over regular sampling
if(get_use_causal())
{
{
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
causal::sampling::setup();
}
push_enable_sampling_on_child_threads(get_use_causal());
sampling::unblock_signals();
}
else if(get_use_sampling())
{
{
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
sampling::setup();
}
push_enable_sampling_on_child_threads(get_use_sampling());
sampling::unblock_signals();
}
get_main_bundle()->start();
ROCPROFSYS_DEBUG_F("State: %s -> State::Active\n",
std::to_string(get_state()).c_str());
set_state(State::Active); // set to active as very last operation
} };
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
// ideally these have already been started
rocprofsys_preinit_hidden();
// start these gotchas once settings have been initialized
if(get_init_bundle()) get_init_bundle()->start();
if(get_use_vaapi_tracing())
{
ROCPROFSYS_VERBOSE_F(1, "Setting up VA-API traces...\n");
component::vaapi_gotcha::start();
}
// sampling signals are blocked here and unblocked again by the deferred setup above
if(get_use_sampling()) sampling::block_signals();
// perfetto initialization
if(get_use_perfetto())
{
ROCPROFSYS_VERBOSE_F(1, "Setting up Perfetto...\n");
rocprofsys::perfetto::setup();
}
tasking::setup();
if(get_use_causal()) causal::start_experimenting();
if(get_use_timemory())
{
comp::user_global_bundle::global_init();
std::set<int> _comps{};
// convert string into set of enumerations
for(auto&& itr : tim::delimit(tim::settings::global_components()))
_comps.emplace(tim::runtime::enumerate(itr));
if(_comps.size() == 1 && _comps.find(TIMEMORY_WALL_CLOCK) != _comps.end())
{
// using wall_clock directly is lower overhead than using it via user_bundle
instrumentation_bundle_t::get_initializer() =
[](instrumentation_bundle_t& _bundle) {
_bundle.initialize<comp::wall_clock>();
};
}
else if(!_comps.empty())
{
// use user_bundle for other than wall-clock
instrumentation_bundle_t::get_initializer() =
[](instrumentation_bundle_t& _bundle) {
_bundle.initialize<comp::user_global_bundle>();
};
}
else
{
// no components requested: disable the rocprofsys project at runtime
tim::trait::runtime_enabled<project::rocprofsys>::set(false);
}
}
if(get_use_ompt())
{
ROCPROFSYS_VERBOSE_F(1, "Setting up OMPT...\n");
ompt::setup();
}
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
// Force rocprofiler_configure if it hasn't been called through __hip_module_ctor.
// rocprofiler_configure needs to be called before rcclp::setup to decide
// whether we want to use gotcha wrappers for rccl or rocprofiler based tracing.
if(get_use_rocm())
{
ROCPROFSYS_VERBOSE_F(1, "Setting up ROCm...\n");
rocprofiler_sdk::setup();
}
#endif
if(get_use_rcclp())
{
ROCPROFSYS_VERBOSE_F(1, "Setting up RCCLP...\n");
rcclp::setup();
}
if(get_use_perfetto())
{
ROCPROFSYS_VERBOSE_F(1, "Starting Perfetto...\n");
rocprofsys::perfetto::start();
}
categories::setup();
// if static objects are destroyed in the inverse order of when they are
// created this should ensure that finalization is called before perfetto
// ends the tracing session
static auto _ensure_finalization = ensure_finalization();
return true;
}
//======================================================================================//
/// Entry point called by instrumented code to announce the instrumentation mode.
///
/// \param _mode               instrumentation mode; must be a valid C string. It is
///                            exported as ROCPROFSYS_MODE without overriding an
///                            existing value
/// \param _is_binary_rewrite  stored into config::is_binary_rewrite()
/// \param _argv0_c            command name; when null, config::get_exe_name() is used
///
/// The function may be called several times when multiple libraries are
/// instrumented. Repeat calls whose mode and binary-rewrite flag equal those of the
/// first call return immediately; repeat calls with different values trigger
/// ROCPROFSYS_CONDITIONAL_THROW.
extern "C" void
rocprofsys_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _argv0_c)
{
    static int _total_count = 0;
    // Arguments of the first invocation. The mode is kept as an owning std::string:
    // a std::string_view would keep referring to the first caller's buffer, which is
    // not guaranteed to stay valid until a later invocation compares/prints it.
    static auto _args = std::make_pair(std::string{ _mode }, _is_binary_rewrite);

    auto _count   = _total_count++;
    auto _mode_sv = std::string_view{ _mode };
    auto _argv0   = (_argv0_c) ? std::string{ _argv0_c } : config::get_exe_name();

    const bool _same_args =
        (_args.first == _mode_sv) && (_args.second == _is_binary_rewrite);

    // this function may be called multiple times if multiple libraries are
    // instrumented: repeat calls with the arguments of the first call are ignored
    if(_count > 0 && _same_args) return;

    ROCPROFSYS_CONDITIONAL_THROW(
        _count > 0 && !_same_args,
        "\nrocprofsys_init(...) called multiple times with different arguments for mode "
        "and/or is_binary_rewrite:"
        "\n Invocation #1: rocprofsys_init(mode=%-8s, is_binary_rewrite=%-5s, ...)"
        "\n Invocation #%i: rocprofsys_init(mode=%-8s, is_binary_rewrite=%-5s, ...)",
        _args.first.c_str(), std::to_string(_args.second).c_str(), _count + 1, _mode,
        std::to_string(_is_binary_rewrite).c_str());

    // always the first
    (void) get_state();
    (void) tracing::push_count();
    (void) tracing::pop_count();

    if(get_state() >= State::Init)
    {
        // initializing late is only expected to be harmless for the trace mode
        if(_mode_sv != "trace" && _mode_sv != "Trace")
        {
            ROCPROFSYS_WARNING_F(
                0,
                "rocprofsys_init(mode=%s, is_binary_rewrite=%s, argv0=%s) "
                "called after rocprof-sys was initialized. state = %s. Mode-based "
                "settings (via -M <MODE> passed to rocprof-sys exe) may not be "
                "properly configured.\n",
                _mode, std::to_string(_is_binary_rewrite).c_str(), _argv0.c_str(),
                std::to_string(get_state()).c_str());
        }
    }

    // The finalizer runs long after this call returned, so it captures an owning copy
    // of argv0 instead of the caller's raw pointer. When no argv0 was supplied the
    // executable name is still looked up at finalization time, as before.
    tracing::get_finalization_functions().emplace_back(
        [_has_argv0  = (_argv0_c != nullptr),
         _argv0_copy = std::string{ (_argv0_c) ? _argv0_c : "" }]() {
            ROCPROFSYS_CI_THROW(get_state() != State::Active,
                                "Finalizer function for popping main invoked in non-active "
                                "state :: state = %s\n",
                                std::to_string(get_state()).c_str());
            if(get_state() == State::Active)
            {
                auto _name =
                    (_has_argv0) ? std::string{ _argv0_copy } : config::get_exe_name();
                // if main hasn't been popped yet, pop it
                ROCPROFSYS_BASIC_VERBOSE(2, "Running rocprofsys_pop_trace(%s)...\n",
                                         _name.c_str());
                rocprofsys_pop_trace_hidden(_name.c_str());
            }
        });

    std::atexit([]() {
        // if active (not already finalized) then we should finalize
        if(get_state() == State::Active) rocprofsys_finalize_hidden();
    });

    ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(
        get_debug_env() || get_verbose_env() > 2,
        "mode: %s | is binary rewrite: %s | command: %s\n", _mode,
        (_is_binary_rewrite) ? "y" : "n", _argv0.c_str());

    tim::set_env("ROCPROFSYS_MODE", _mode, 0);
    config::is_binary_rewrite() = _is_binary_rewrite;

    // run the pre-init step here only when rocprofsys_set_mpi_hidden was called
    if(_set_mpi_called)
    {
        rocprofsys_preinit_hidden();
    }
}
//======================================================================================//
/// Sets ROCPROFSYS_PRELOAD to "0" and rewrites LD_PRELOAD without any entry that
/// contains "librocprof-sys". LD_PRELOAD is left untouched when it contains no such
/// entry.
extern "C" void
rocprofsys_reset_preload_hidden(void)
{
    // substring test shared by the whole-variable check and the per-entry filter
    const auto _mentions_rocprofsys = [](const auto& _text) {
        return _text.find("librocprof-sys") != std::string::npos;
    };

    tim::set_env("ROCPROFSYS_PRELOAD", "0", 1);

    auto _ld_preload = common::get_env("LD_PRELOAD", std::string{});
    if(!_mentions_rocprofsys(_ld_preload)) return;

    // re-assemble the list from the entries that are kept; every kept entry is
    // appended with a leading ':'
    auto _kept = std::string{};
    for(const auto& _entry : delimit(_ld_preload, ":"))
    {
        if(!_mentions_rocprofsys(_entry)) _kept += common::join("", ":", _entry);
    }

    // drop the separator in front of the first entry
    if(!_kept.empty() && _kept.front() == ':') _kept.erase(0, 1);

    tim::set_env("LD_PRELOAD", _kept, 1);
}
//======================================================================================//
// Finalizes rocprof-sys: stops all measurement back-ends, post-processes and writes
// their output, and destroys the static objects.
//
// Nothing beyond the first few bookkeeping calls happens unless the state is
// State::Active. In a child process (is_child_process()) the state is set to
// Finalized and the process exits via std::quick_exit.
//
// May throw (ROCPROFSYS_THROW) after finalization completed when the perfetto
// output reported an error, and (CI builds) when more pushes than pops were counted.
extern "C" void
rocprofsys_finalize_hidden(void)
{
// disable thread id recycling during finalization
threading::recycle_ids() = false;
// disable initialization callback
threading::remove_callback(&ensure_initialization);
bool _is_child = is_child_process();
set_thread_state(ThreadState::Completed);
// return if not active
if(get_state() != State::Active)
{
ROCPROFSYS_BASIC_DEBUG_F("State = %s. Finalization skipped\n",
std::to_string(get_state()).c_str());
return;
}
else if(_is_child)
{
// child processes do not run the finalization steps below
set_state(State::Finalized);
std::quick_exit(EXIT_SUCCESS);
return;
}
if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n");
ROCPROFSYS_VERBOSE_F(0, "finalizing...\n");
// stop sample collection, record the stop time and block the sampling signals for
// the whole process before anything is torn down
sampling::block_samples();
thread_info::set_stop(comp::wall_clock::record());
tim::signals::block_signals(get_sampling_signals(),
tim::signals::sigmask_scope::process);
rocprofsys_reset_preload_hidden();
// some functions called during finalization may alter the push/pop count so we need
// to save them here
auto _push_count = tracing::push_count().load();
auto _pop_count = tracing::pop_count().load();
// e.g. rocprofsys_pop_trace("main");
if(_push_count > _pop_count)
{
// each registered finalization function is counted as one pop
for(auto& itr : tracing::get_finalization_functions())
{
itr();
++_pop_count;
}
}
set_state(State::Finalized);
push_enable_sampling_on_child_threads(false);
set_sampling_on_all_future_threads(false);
// if the categories are not enabled, it can/will suppress generating output for data
// in category
categories::enable_categories();
// when finalize-debugging is requested, force ROCPROFSYS_DEBUG on; the previous
// value is put back by _debug_dtor (assuming scope::destructor invokes its callable
// on destruction)
auto _debug_init = get_debug_finalize();
auto _debug_value = get_debug();
if(_debug_init) config::set_setting_value("ROCPROFSYS_DEBUG", true);
scope::destructor _debug_dtor{ [_debug_value, _debug_init]() {
if(_debug_init) config::set_setting_value("ROCPROFSYS_DEBUG", _debug_value);
} };
// stop the thread bundle of the calling thread, if there is one
auto& _thread_bundle = thread_data<thread_bundle_t>::instance();
if(_thread_bundle) _thread_bundle->stop();
if(get_verbose() >= 1 || get_debug())
{
// settings are printed by rank 0 only
if(dmp::rank() == 0)
{
ROCPROFSYS_PRINT_F("\n");
config::print_settings(
tim::get_env<bool>("ROCPROFSYS_PRINT_ENV", get_debug()));
}
}
ROCPROFSYS_VERBOSE_F(1, "rocprofsys_push_trace :: called %zux\n", _push_count);
ROCPROFSYS_VERBOSE_F(1, "rocprofsys_pop_trace :: called %zux\n", _pop_count);
// register an empty handler for the interrupt signal
tim::signals::enable_signal_detection({ tim::signals::sys_signal::Interrupt },
[](int) {});
ROCPROFSYS_DEBUG_F("Copying over all timemory hash information to main thread...\n");
tracing::copy_timemory_hash_ids();
// stop the main bundle which has stats for run
if(get_main_bundle())
{
ROCPROFSYS_DEBUG_F("Stopping main bundle...\n");
get_main_bundle()->stop();
}
// measures the finalization itself; reported in the "Finalized: ..." message
fini_bundle_t _finalization{};
_finalization.start();
if(get_use_vaapi_tracing())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down VA-API tracing...\n");
component::vaapi_gotcha::shutdown();
}
if(get_use_rcclp())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down RCCLP...\n");
rcclp::shutdown();
}
if(get_use_ompt())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down OMPT...\n");
ompt::shutdown();
}
#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0
if(get_use_rocm())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down ROCm...\n");
rocprofiler_sdk::shutdown();
}
#endif
ROCPROFSYS_DEBUG_F("Stopping and destroying instrumentation bundles...\n");
// stop and pop every instrumentation bundle that is still open on any thread
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
{
if(!instrumentation_bundles::get()) continue;
// NOTE(review): _info is dereferenced below without a validity check, whereas
// ensure_initialization tests static_cast<bool>(_info) -- confirm that
// thread_info::get(i, SequentTID) cannot be empty here
const auto& _info = thread_info::get(i, SequentTID);
auto& itr = instrumentation_bundles::get()->at(i);
while(itr != nullptr && !itr->empty())
{
// open bundles on offset threads count as a pop and are reported at a higher
// verbosity level
int _lvl = 1;
if(_info->is_offset)
{
++_pop_count;
_lvl = 4;
}
ROCPROFSYS_VERBOSE_F(
_lvl,
"Warning! instrumentation bundle on thread %zu (TID=%li) "
"with label '%s' was not stopped.\n",
i, itr->back()->tid(), itr->back()->key().c_str());
itr->back()->stop();
itr->back()->pop();
itr->pop_back();
}
}
// stop the main gotcha which shuts down the pthread gotchas
if(get_init_bundle())
{
ROCPROFSYS_DEBUG_F("Stopping main gotcha...\n");
get_init_bundle()->stop();
pthread_gotcha::shutdown();
component::numa_gotcha::shutdown();
}
// stop the gotcha bundle
if(get_preinit_bundle())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down miscellaneous gotchas...\n");
get_preinit_bundle()->stop();
component::mpi_gotcha::shutdown();
}
if(get_use_process_sampling())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down background sampler...\n");
process_sampler::shutdown();
}
if(get_use_causal())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down causal sampling...\n");
causal::sampling::shutdown();
}
if(get_use_sampling())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down sampling...\n");
sampling::shutdown();
}
ROCPROFSYS_VERBOSE_F(3, "Reporting the process- and thread-level metrics...\n");
// report the high-level metrics for the process
if(get_main_bundle())
{
ROCPROFSYS_VERBOSE_F(0, "\n");
std::string _msg = JOIN("", *get_main_bundle());
// strip everything up to and including the ">>> " marker
// NOTE(review): the searched token has four characters while five are skipped --
// confirm the intended offset
auto _pos = _msg.find(">>> ");
if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5);
ROCPROFSYS_VERBOSE_F(0, "%s\n", _msg.c_str());
ROCPROFSYS_DEBUG_F("Resetting main bundle...\n");
get_main_bundle()->reset();
}
// print out thread-data if they are not still running
// if they are still running (e.g. thread-pool still alive), the
// thread-specific data will be wrong if try to stop them from
// the main thread.
// with causal profiling the per-thread report requires verbosity 1 instead of 0
auto _thr_verbose = (config::get_use_causal()) ? 1 : 0;
if(thread_data<thread_bundle_t>::get())
{
for(auto& itr : *thread_data<thread_bundle_t>::get())
{
if(itr && itr->get<comp::wall_clock>() &&
!itr->get<comp::wall_clock>()->get_is_running())
{
std::string _msg = JOIN("", *itr);
// same marker stripping as for the main bundle above
auto _pos = _msg.find(">>> ");
if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5);
ROCPROFSYS_VERBOSE_F(_thr_verbose, "%s\n", _msg.c_str());
}
}
}
ROCPROFSYS_VERBOSE_F(0, "\n");
// ensure that all the MT instances are flushed
if(get_use_sampling())
{
ROCPROFSYS_VERBOSE_F(1, "Post-processing the sampling backtraces...\n");
sampling::post_process();
}
if(get_use_causal())
{
ROCPROFSYS_VERBOSE_F(1, "Finishing the causal experiments...\n");
causal::finish_experimenting();
}
if(get_use_process_sampling())
{
ROCPROFSYS_VERBOSE_F(1, "Post-processing the system-level samples...\n");
process_sampler::post_process();
}
// shutdown tasking before timemory is finalized
ROCPROFSYS_VERBOSE_F(1, "Shutting down thread-pools...\n");
tasking::shutdown();
if(get_use_code_coverage())
{
ROCPROFSYS_VERBOSE_F(1, "Post-processing the code coverage...\n");
coverage::post_process();
}
tracing::copy_timemory_hash_ids();
// a perfetto output error is only recorded here; it is raised after the remaining
// finalization steps have run
bool _perfetto_output_error = false;
if(get_use_perfetto())
{
ROCPROFSYS_VERBOSE_F(0, "Finalizing perfetto...\n");
rocprofsys::perfetto::post_process(_timemory_manager.get(),
_perfetto_output_error);
}
// NOTE(review): both operands of this condition test the same pointer for null
if(_timemory_manager && _timemory_manager != nullptr)
{
// add the memory maps of this process (and the set of existing mapped files) to
// the metadata
_timemory_manager->add_metadata([](auto& ar) {
auto _maps = tim::procfs::read_maps(process::get_id());
auto _libs = std::set<std::string>{};
for(auto& itr : _maps)
{
auto&& _path = itr.pathname;
// skip empty names, names starting with '[' and paths that do not exist
if(!_path.empty() && _path.at(0) != '[' && filepath::exists(_path))
_libs.emplace(_path);
}
ar(tim::cereal::make_nvp("memory_maps_files", _libs),
tim::cereal::make_nvp("memory_maps", _maps));
});
ROCPROFSYS_VERBOSE_F(1, "Finalizing timemory...\n");
tim::timemory_finalize(_timemory_manager.get());
// the metadata file name gets the process suffix when get_use_pid() is set
auto _cfg = settings::compose_filename_config{};
_cfg.use_suffix = config::get_use_pid();
_cfg.suffix = settings::default_process_suffix();
_timemory_manager->write_metadata(settings::get_global_output_prefix(),
"rocprofsys", _cfg);
}
categories::shutdown();
_finalization.stop();
if(_perfetto_output_error)
{
ROCPROFSYS_THROW("Error opening perfetto output file: %s",
get_perfetto_output_filename().c_str());
}
// CI-only consistency check of the push/pop counters
ROCPROFSYS_CI_THROW(
_push_count > _pop_count, "%s",
TIMEMORY_JOIN(" ",
"rocprofsys_push_trace was called more times than "
"rocprofsys_pop_trace. The inverse is fine but the current state "
"means not every measurement was ended :: pushed:",
_push_count, "vs. popped:", _pop_count)
.c_str());
debug::close_file();
config::finalize();
ROCPROFSYS_VERBOSE_F(0, "Finalized: %s\n", _finalization.as_string().c_str());
// register empty handlers for the segfault and stop signals
tim::signals::enable_signal_detection(
{ tim::signals::sys_signal::SegFault, tim::signals::sys_signal::Stop },
[](int) {});
common::destroy_static_objects();
}
//======================================================================================//
namespace
{
// if static objects are destroyed randomly (relatively uncommon behavior)
// this might call finalization before perfetto ends the tracing session
// but static variable in rocprofsys_init_tooling_hidden is more likely
//
// ensure_finalization(true) runs its setup while this translation unit's
// namespace-scope objects are initialized; the returned object carries the call to
// rocprofsys_finalize_hidden().
auto _ensure_finalization = ensure_finalization(true);
// additional handles to the timemory manager and settings instances
auto _manager = tim::manager::instance();
auto _settings = tim::settings::shared_instance();
} // namespace