// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// NOTE(review): in the copy of this file that was reviewed, every angle-bracket
// sequence beginning with an identifier is missing: the targets of the system
// `#include` directives, template parameter/argument lists (e.g. after
// `template`, `comp::base`, `category_region::start`, `std::forward`,
// `std::make_unique`, `std::unique_ptr`, `std::optional`, `trait::runtime_enabled`)
// and the target types of `reinterpret_cast`. Those tokens are reproduced exactly
// as found rather than guessed; restore them from version control before building.

#include "api.hpp"
#include "core/common.hpp"
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/defines.hpp"

#include

#if defined(ROCPROFSYS_USE_OMPT) && ROCPROFSYS_USE_OMPT > 0

#    include "binary/link_map.hpp"
#    include "core/components/fwd.hpp"
#    include "library/components/category_region.hpp"
#    include "library/tracing.hpp"

#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include
#    include

using api_t = tim::project::rocprofsys;

namespace rocprofsys
{
namespace component
{
/// Component that turns OpenMP-tools (OMPT) callbacks into `category_region`
/// start/stop events. Host-side callbacks go through start()/stop()/store();
/// device trace records are emitted through the static record() helper.
struct ompt : comp::base
{
    using value_type     = void;
    using base_type      = comp::base;
    using context_info_t = tim::openmp::context_info;

    static std::string label() { return "ompt"; }
    static std::string description() { return "OpenMP tools tracing"; }

    ompt()                = default;
    ~ompt()               = default;
    ompt(const ompt&)     = default;
    ompt(ompt&&) noexcept = default;

    ompt& operator=(const ompt&) = default;
    ompt& operator=(ompt&&) noexcept = default;

    /// Opens a region for an OMPT callback.
    ///
    /// A region named by the stored prefix is started first. A second region is
    /// then started at the current `tracing::now()` timestamp, named by
    /// `_ctx_info.func` (or the prefix when `func` is empty). When
    /// `_ctx_info.target_arguments` is set and carries a non-zero `host_op_id`,
    /// that id is attached as a process-scoped perfetto flow.
    ///
    /// @param _ctx_info  context describing the callback (label, arguments, ...)
    template
    void start(const context_info_t& _ctx_info, Args&&...) const
    {
        category_region::start(m_prefix);

        auto     _ts  = tracing::now();
        uint64_t _cid = (_ctx_info.target_arguments)
                            ? _ctx_info.target_arguments->host_op_id
                            : 0;

        // annotations are only attached when enabled in the configuration
        auto _annotate = [&](::perfetto::EventContext ctx) {
            if(config::get_perfetto_annotations())
            {
                tracing::add_perfetto_annotation(ctx, "begin_ns", _ts);
                for(const auto& itr : _ctx_info.arguments)
                    tracing::add_perfetto_annotation(ctx, itr.label, itr.value);
            }
        };

        if(_cid > 0)
        {
            category_region::start(
                (_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
                ::perfetto::Flow::ProcessScoped(_cid), std::move(_annotate));
        }
        else
        {
            category_region::start(
                (_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
                std::move(_annotate));
        }
    }

    /// Closes the regions opened by start(), annotating the end timestamp as
    /// "end_ns" together with the callback arguments (when annotations are
    /// enabled).
    ///
    /// @param _ctx_info  context describing the callback
    template
    void stop(const context_info_t& _ctx_info, Args&&...) const
    {
        category_region::stop(m_prefix);

        auto     _ts  = tracing::now();
        uint64_t _cid = (_ctx_info.target_arguments)
                            ? _ctx_info.target_arguments->host_op_id
                            : 0;

        auto _annotate = [&](::perfetto::EventContext ctx) {
            if(config::get_perfetto_annotations())
            {
                tracing::add_perfetto_annotation(ctx, "end_ns", _ts);
                for(const auto& itr : _ctx_info.arguments)
                    tracing::add_perfetto_annotation(ctx, itr.label, itr.value);
            }
        };

        // NOTE(review): as visible here both branches are token-for-token
        // identical. They are kept separate because the (missing) template
        // arguments could not be confirmed to match; collapse them once verified.
        if(_cid > 0)
        {
            category_region::stop(
                (_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
                std::move(_annotate));
        }
        else
        {
            category_region::stop(
                (_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
                std::move(_annotate));
        }
    }

    /// Records an instantaneous callback as a start() immediately followed by a
    /// stop(). The extra arguments are ignored by both callees.
    template
    void store(const context_info_t& _ctx_info, Args&&... _args) const
    {
        start(_ctx_info, std::forward(_args)...);
        stop(_ctx_info, std::forward(_args)...);
    }

    /// Emits one completed device-side operation on the perfetto track labelled
    /// "OpenMP Target Offloads".
    ///
    /// @param name      region name
    /// @param id        id used for the process-scoped perfetto flow
    /// @param beg_time  begin timestamp passed to the region start
    /// @param end_time  end timestamp passed to the region stop
    /// @param thrd_id   unused
    /// @param targ_id   unused
    /// @param common    context whose arguments become annotations (when
    ///                  annotations are enabled)
    static void record(std::string_view name, ompt_id_t id, uint64_t beg_time,
                       uint64_t end_time, uint64_t thrd_id, uint64_t targ_id,
                       const context_info_t& common)
    {
        (void) thrd_id;
        (void) targ_id;

        auto _annotate = [&](::perfetto::EventContext ctx) {
            if(config::get_perfetto_annotations())
            {
                for(const auto& itr : common.arguments)
                    tracing::add_perfetto_annotation(ctx, itr.label, itr.value);
            }
        };

        auto _track = tracing::get_perfetto_track(
            category::ompt{},
            [](auto) -> std::string { return "OpenMP Target Offloads"; }, 0);

        category_region::start(
            name, _track, beg_time, ::perfetto::Flow::ProcessScoped(id),
            std::move(_annotate));
        category_region::stop(name, _track, end_time);
    }

    /// Stores the region prefix. Only a view is kept, so the referenced
    /// characters must outlive every later start()/stop() call on this object.
    void set_prefix(std::string_view _v) { m_prefix = _v; }

private:
    std::string_view m_prefix = {};
};
}  // namespace component
}  // namespace rocprofsys

namespace tim
{
namespace trait
{
// selects the rocprofsys ompt component as the toolset wrapped by the handle
template <>
struct ompt_handle
{
    using type = component_tuple<::rocprofsys::component::ompt>;
};
}  // namespace trait
}  // namespace tim

namespace rocprofsys
{
namespace ompt
{
namespace
{
using ompt_handle_t  = tim::component::ompt_handle;
using ompt_context_t = tim::openmp::context_handler;
using ompt_toolset_t = typename ompt_handle_t::toolset_type;
using ompt_bundle_t  = tim::component_tuple;

// bundle created by setup() and destroyed by shutdown()
std::unique_ptr f_bundle = {};

// evaluated during static initialization purely for its side effects: both
// runtime_enabled traits are switched off until setup() turns them back on
bool _init_toolset_off = (trait::runtime_enabled::set(false),
                          trait::runtime_enabled::set(false), true);

// value returned by tim::ompt::configure(); invoked once from shutdown()
tim::ompt::finalize_tool_func_t f_finalize = nullptr;
}  // namespace

/// Enables the OMPT traits and creates the "rocprofsys/ompt" bundle. Does
/// nothing when `tim::settings::enabled()` is false.
void
setup()
{
    if(!tim::settings::enabled()) return;

    trait::runtime_enabled::set(true);
    trait::runtime_enabled::set(true);

    tim::auto_lock_t lk{ tim::type_mutex() };
    f_bundle = std::make_unique("rocprofsys/ompt", quirk::config{});
}

/// Stops the bundle, disables the OMPT traits, stops device tracing and runs the
/// stored finalize callback.
///
/// A function-local flag makes nested calls return immediately (tool_finalize()
/// also routes here). The flag is a plain `bool`, so this guards against
/// re-entrancy only, not against concurrent callers.
void
shutdown()
{
    static bool _protect = false;
    if(_protect) return;
    _protect = true;

    if(f_bundle)
    {
        if(tim::manager::instance())
            tim::manager::instance()->cleanup("rocprofsys-ompt");

        f_bundle->stop();
        ompt_context_t::cleanup();
        trait::runtime_enabled::set(false);
        trait::runtime_enabled::set(false);
        pthread_gotcha::shutdown();

        // call the OMPT finalize callback
        if(f_finalize)
        {
            // stop tracing on every device that provides a stop_trace entry
            for(const auto& itr : tim::openmp::get_ompt_device_functions())
                if(itr.second.stop_trace) itr.second.stop_trace(itr.second.device);

            (*f_finalize)();
            f_finalize = nullptr;
        }
    }

    f_bundle.reset();
    _protect = false;
}

namespace
{
/// Process-wide flag recording whether tool_initialize() decided to activate
/// the tool.
bool&
use_tool()
{
    static bool _v = false;
    return _v;
}

/// `initialize` callback handed to the OpenMP runtime via ompt_start_tool().
///
/// Decides whether the tool is active, installs the codeptr-return-address
/// resolver and the device-tracing setup callback, then calls
/// `tim::ompt::configure` and stores its result in `f_finalize`.
///
/// @param lookup              forwarded to `tim::ompt::configure`
/// @param initial_device_num  forwarded to `tim::ompt::configure`
/// @param tool_data           forwarded to `tim::ompt::configure`
/// @return always 1
int
tool_initialize(ompt_function_lookup_t lookup, int initial_device_num,
                ompt_data_t* tool_data)
{
    if(!rocprofsys::settings_are_configured())
    {
        ROCPROFSYS_BASIC_WARNING_F(
            0,
            "[%s] invoked before rocprof-sys was initialized. In instrumentation mode, "
            "settings exported to the environment have not been propagated yet...\n",
            __FUNCTION__);
        // settings are not available yet, so consult the environment directly
        use_tool() = get_env("ROCPROFSYS_USE_OMPT", true, false);
    }
    else
    {
        use_tool() = rocprofsys::config::get_use_ompt();
    }

    if(use_tool())
    {
        ROCPROFSYS_BASIC_VERBOSE_F(
            2, "OpenMP-tools configuring for initial device %i\n\n",
            initial_device_num);

        // the key is currently just a copy of the label; the arguments are
        // accepted but not used
        static auto _generate_key =
            [](std::string_view                       _key_v,
               const ::tim::openmp::argument_array_t& _args_v) {
                (void) _args_v;
                return std::string{ _key_v };
            };

        // resolver: fills in func/file/line of the context from codeptr_ra and
        // returns the key for the callback
        tim::openmp::get_codeptr_ra_resolver() =
            [](tim::openmp::context_info& _ctx_info) {
                const auto& _key       = _ctx_info.label;
                const auto* codeptr_ra = _ctx_info.codeptr_ra;
                auto&       _args      = _ctx_info.arguments;

                ROCPROFSYS_BASIC_VERBOSE(
                    2, "resolving codeptr return address for %s\n", _key.data());

                if(!codeptr_ra) return _generate_key(_key, _args);

                // refresh the file maps once per thread before the first lookup
                static thread_local auto _once = std::once_flag{};
                std::call_once(_once, []() { ::tim::unwind::update_file_maps(); });

                auto _info = ::rocprofsys::binary::lookup_ipaddr_entry(
                    reinterpret_cast(codeptr_ra));

                if(_info)
                {
                    _ctx_info.func = tim::demangle(_info->name);
                    if(_info->lineno > 0)
                    {
                        auto _linfo =
                            _info->lineinfo.rget([](const auto& _v) -> bool {
                                return (_v && !_v.location.empty() && _v.line > 0);
                            });
                        if(_linfo)
                        {
                            _ctx_info.file = _linfo.location;
                            _ctx_info.line = _linfo.line;
                            _args.emplace_back("file", _ctx_info.file);
                            _args.emplace_back(
                                "lineinfo", ::timemory::join::join(
                                                "@", _ctx_info.file, _ctx_info.line));
                        }
                        else
                        {
                            _ctx_info.file = _info->location;
                            _args.emplace_back("file", _ctx_info.file);
                        }
                    }
                    // same key whether or not line information was found
                    return _generate_key(
                        ::timemory::join::join(" @ ", _key, _ctx_info.func), _args);
                }
                else
                {
                    auto _dl_info = Dl_info{ nullptr, nullptr, nullptr, nullptr };
                    if(dladdr(codeptr_ra, &_dl_info) != 0)
                    {
                        if(_dl_info.dli_fname != nullptr)
                        {
                            _ctx_info.file = _dl_info.dli_fname;
                            _args.emplace_back("file", _ctx_info.file);
                        }
                        // dladdr() can succeed without finding a symbol, in which
                        // case dli_sname is NULL and must not be demangled; fall
                        // through to the raw-address fallback below instead
                        if(_dl_info.dli_sname != nullptr)
                        {
                            _ctx_info.func = tim::demangle(_dl_info.dli_sname);
                            return _generate_key(
                                ::timemory::join::join(
                                    " @ ", _key,
                                    ::timemory::join::join("", _ctx_info.func, " [",
                                                           _ctx_info.file, "]")),
                                _args);
                        }
                    }
                }

                // since no line info could be deduced, include the codeptr return
                // address
                auto _args_codeptr_v = _args;
                _args_codeptr_v.emplace_back("codeptr_ra", codeptr_ra);
                return _generate_key(_key, _args_codeptr_v);
            };

        // per-device setup: enable the traced record types, install the buffer
        // callbacks and start tracing
        tim::openmp::get_function_lookup_callback<api_t>() =
            [](ompt_function_lookup_t, const std::optional& params) {
                if(!params) return;

                ROCPROFSYS_VERBOSE(3, "[ompt] configuring device %i...\n",
                                   params->device_num);

                auto& device_funcs =
                    tim::openmp::get_ompt_device_functions().at(params->device_num);

                device_funcs.set_trace_ompt(params->device, 1,
                                            ompt_callback_target_data_op);
                device_funcs.set_trace_ompt(params->device, 1,
                                            ompt_callback_target_submit);

                // hands the runtime one page of anonymous memory per request
                static ompt_callback_buffer_request_t request =
                    [](int device_num, ompt_buffer_t** buffer, size_t* bytes) {
                        ROCPROFSYS_VERBOSE(3, "[ompt] buffer request...\n");
                        (void) device_num;

                        const size_t _nbytes = ::tim::units::get_page_size();
                        void*        _mem =
                            mmap(nullptr, _nbytes, PROT_READ | PROT_WRITE,
                                 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

                        // mmap reports failure as MAP_FAILED, not nullptr; never
                        // hand that value to the runtime as a buffer. Reporting
                        // zero bytes signals that no buffer was provided.
                        if(_mem == MAP_FAILED)
                        {
                            ROCPROFSYS_VERBOSE(
                                0,
                                "[ompt] buffer request failed: unable to map %zu "
                                "bytes\n",
                                _nbytes);
                            *buffer = nullptr;
                            *bytes  = 0;
                            return;
                        }

                        *buffer = _mem;
                        *bytes  = _nbytes;
                    };

                // walks the records of a completed buffer and emits one region
                // per target-submit / target-data-op record
                static ompt_callback_buffer_complete_t complete =
                    [](int device_num, ompt_buffer_t* buffer, size_t bytes,
                       ompt_buffer_cursor_t begin, int buffer_owned) {
                        ROCPROFSYS_VERBOSE(3, "[ompt] buffer complete...\n");

                        tim::consume_parameters(device_num, buffer, bytes, begin,
                                                buffer_owned);

                        auto _funcs =
                            tim::openmp::get_ompt_device_functions().at(device_num);

                        // offset added to every device timestamp below
                        auto _skew = rocprofsys::tracing::get_clock_skew(
                            [&_funcs]() {
                                return _funcs.get_device_time(_funcs.device);
                            });

                        size_t _nrecords = 0;
                        for(ompt_buffer_cursor_t _cursor = begin; _cursor != 0;)
                        {
                            ++_nrecords;
                            auto* _record = _funcs.get_record_ompt(buffer, _cursor);
                            if(_record)
                            {
                                const char* _type =
                                    tim::openmp::get_enum_label(_record->type);
                                auto          _thrd_id = _record->thread_id;
                                auto          _targ_id = _record->target_id;
                                unsigned long beg_time = _record->time + _skew;
                                unsigned long end_time = 0;
                                ompt_id_t     id       = 0;
                                const char*   _name =
                                    tim::openmp::get_enum_label(_record->type);

                                if(_record->type == ompt_callback_target_submit)
                                {
                                    auto& _data = _record->record.target_kernel;
                                    end_time    = _data.end_time + _skew;
                                    id          = _data.host_op_id;

                                    auto _ctx_info = tim::openmp::argument_array_t{
                                        { "begin_ns", beg_time },
                                        { "end_ns", end_time },
                                        { "type", _type },
                                        { "thread_id", _thrd_id },
                                        { "target_id", _targ_id },
                                        { "host_op_id", id },
                                        { "requested_num_teams",
                                          _data.requested_num_teams },
                                        { "granted_num_teams",
                                          _data.granted_num_teams }
                                    };

                                    component::ompt::record(
                                        _name, id, beg_time, end_time, _thrd_id,
                                        _targ_id,
                                        tim::openmp::context_info{ _name, nullptr,
                                                                   _ctx_info });
                                }
                                else if(_record->type == ompt_callback_target_data_op)
                                {
                                    auto& _data = _record->record.target_data_op;
                                    end_time    = _data.end_time + _skew;
                                    id          = _data.host_op_id;

                                    const auto* _opname =
                                        tim::openmp::get_enum_label(_data.optype);

                                    auto _ctx_info = tim::openmp::argument_array_t{
                                        { "begin_ns", beg_time },
                                        { "end_ns", end_time },
                                        { "type", _type },
                                        { "thread_id", _thrd_id },
                                        { "target_id", _targ_id },
                                        { "host_op_id", id },
                                        { "optype", _opname },
                                        { "src_addr",
                                          reinterpret_cast(_data.src_addr) },
                                        { "dst_addr",
                                          reinterpret_cast(_data.dest_addr) },
                                        { "src_device_num", _data.src_device_num },
                                        { "dst_device_num", _data.dest_device_num },
                                        { "bytes", _data.bytes },
                                    };

                                    component::ompt::record(
                                        _opname, id, beg_time, end_time, _thrd_id,
                                        _targ_id,
                                        tim::openmp::context_info{ _name, nullptr,
                                                                   _ctx_info });
                                }

                                // record types not handled above leave end_time at
                                // 0; clamp so the unsigned difference cannot wrap
                                const unsigned long _delta =
                                    (end_time >= beg_time) ? (end_time - beg_time) : 0;

                                ROCPROFSYS_VERBOSE(
                                    3,
                                    "type=%i, type_name=%s, start=%lu, end=%lu, "
                                    "delta=%lu, "
                                    "tid=%lu, target_id=%lu, host_id=%lu\n",
                                    _record->type,
                                    tim::openmp::get_enum_label(_record->type),
                                    beg_time, end_time, _delta, _record->thread_id,
                                    _record->target_id, id);
                            }

                            // a zero return means there is no valid next position;
                            // stop instead of relying on the cursor being zeroed
                            if(_funcs.advance_buffer_cursor(_funcs.device, buffer,
                                                            bytes, _cursor,
                                                            &_cursor) == 0)
                                break;
                        }

                        ROCPROFSYS_VERBOSE(3, "[ompt] number of records: %zu\n",
                                           _nrecords);

                        // release the mapping created by the request callback
                        if(buffer_owned == 1)
                        {
                            ::munmap(buffer, bytes);
                        }
                    };

                device_funcs.start_trace(params->device, request, complete);
            };

        f_finalize = tim::ompt::configure(lookup, initial_device_num, tool_data);
    }

    return 1;  // success
}

/// `finalize` callback handed to the OpenMP runtime; defers to shutdown().
void
tool_finalize(ompt_data_t*)
{
    shutdown();
}
}  // namespace
}  // namespace ompt
}  // namespace rocprofsys

extern "C"
{
    ompt_start_tool_result_t* ompt_start_tool(unsigned int,
                                              const char*) ROCPROFSYS_PUBLIC_API;

    /// Exported OMPT entry point. Logs and records the reported versions as
    /// metadata and returns the tool_initialize/tool_finalize callback pair.
    /// The result object is allocated once and reused for every call.
    ompt_start_tool_result_t* ompt_start_tool(unsigned int omp_version,
                                              const char*  runtime_version)
    {
        ROCPROFSYS_BASIC_VERBOSE_F(0, "OpenMP version: %u, runtime version: %s\n",
                                   omp_version, runtime_version);
        ROCPROFSYS_METADATA("OMP_VERSION", omp_version);
        ROCPROFSYS_METADATA("OMP_RUNTIME_VERSION", runtime_version);

        static auto* data = new ompt_start_tool_result_t{
            &rocprofsys::ompt::tool_initialize, &rocprofsys::ompt::tool_finalize, { 0 }
        };
        return data;
    }
}

#else

// OMPT support disabled at build time: setup/shutdown are no-ops
namespace rocprofsys
{
namespace ompt
{
void
setup()
{}

void
shutdown()
{}
}  // namespace ompt
}  // namespace rocprofsys

#endif