// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #define TIMEMORY_KOKKOSP_POSTFIX ROCPROFSYS_PUBLIC_API #include "api.hpp" #include "core/agent_manager.hpp" #include "core/components/fwd.hpp" #include "core/config.hpp" #include "core/defines.hpp" #include "core/node_info.hpp" #include "core/perfetto.hpp" #include "core/trace_cache/cache_manager.hpp" #include "core/trace_cache/sample_type.hpp" #include "library/components/category_region.hpp" #include "library/runtime.hpp" #include #include #include #include #include #include #include #include #include #include "logger/debug.hpp" #include #include #include namespace kokkosp = ::tim::kokkosp; namespace category = ::tim::category; namespace comp = ::rocprofsys::component; using kokkosp_region = comp::local_category_region; //--------------------------------------------------------------------------------------// namespace tim { template <> inline auto invoke_preinit(long) { kokkosp::memory_tracker::label() = "kokkos_memory"; kokkosp::memory_tracker::description() = "Kokkos Memory tracker"; } } // namespace tim //--------------------------------------------------------------------------------------// namespace { std::string kokkos_banner = "#---------------------------------------------------------------------------#"; //--------------------------------------------------------------------------------------// inline void setup_kernel_logger() { if((tim::settings::debug() && tim::settings::verbose() >= 3) || rocprofsys::config::get_use_kokkosp_kernel_logger()) { kokkosp::logger_t::get_initializer() = [](kokkosp::logger_t& _obj) { _obj.initialize(); }; } } } // namespace namespace { bool _standalone_initialized = false; bool _kp_deep_copy = false; size_t _name_len_limit = 0; std::string _kp_prefix = {}; std::vector _initialize_arguments = {}; template void set_invalid_id(Tp* _v) { constexpr bool is32 = std::is_same::value; constexpr bool is64 = std::is_same::value; static_assert(is32 || is64, "only support uint32_t or uint64_t"); *_v = std::numeric_limits::max(); } template bool is_invalid_id(Tp _v) { constexpr bool is32 = std::is_same::value; constexpr bool is64 = std::is_same::value; static_assert(is32 || is64, "only support uint32_t or uint64_t"); return (_v == std::numeric_limits::max()); } template auto strlength(Tp&& _v) { using type = ::tim::concepts::unqualified_type_t; if constexpr(std::is_same::value || std::is_same::value) return _v.length(); else return strnlen(_v, std::max(_name_len_limit, 1)); } template bool violates_name_rules(Arg&& _arg, Args&&... _args) { // for causal profiling we only consider callbacks which are explicitly named if(rocprofsys::config::get_use_causal() && (std::string_view{ _arg }.find("Kokkos::") == 0 || std::string_view{ _arg }.find("Space::") != std::string_view::npos)) return true; size_t _len = (strlength(std::forward(_arg)) + ... + strlength(std::forward(_args))); // ignore labels without names if(_len == 0) return true; else if(_name_len_limit == 0) return false; return (_len >= _name_len_limit); } } // namespace namespace { void metadata_initialize_kokkos_category() { rocprofsys::trace_cache::get_metadata_registry().add_string( rocprofsys::trait::name::value); } void metadata_initialize_kokkos_track() { rocprofsys::trace_cache::get_metadata_registry().add_track( { rocprofsys::trait::name::value, std::nullopt, "{}" }); } void cache_kokkos_event(const char* name, const char* event_type, const char* target, uint64_t timestamp_ns) { nlohmann::json event_metadata; event_metadata["name"] = name; event_metadata["event_type"] = event_type; event_metadata["target"] = target; const size_t stack_id = 0; const size_t parent_stack_id = 0; const size_t correlation_id = 0; const char* call_stack = "{}"; const char* line_info = "{}"; rocprofsys::trace_cache::get_buffer_storage().store( rocprofsys::trace_cache::in_time_sample{ static_cast(rocprofsys::category_enum_id::value), rocprofsys::trait::name::value, timestamp_ns, event_metadata.dump().c_str(), stack_id, parent_stack_id, correlation_id, call_stack, line_info }); } } // namespace //--------------------------------------------------------------------------------------// extern "C" { struct Kokkos_Tools_ToolSettings { bool requires_global_fencing; bool padding[255]; }; void kokkosp_request_tool_settings(const uint32_t, Kokkos_Tools_ToolSettings*) ROCPROFSYS_PUBLIC_API; void kokkosp_dual_view_sync(const char*, const void* const, bool) ROCPROFSYS_PUBLIC_API; void kokkosp_dual_view_modify(const char*, const void* const, bool) ROCPROFSYS_PUBLIC_API; void kokkosp_print_help(char*) {} void kokkosp_parse_args(int argc, char** argv) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); if(!rocprofsys::config::settings_are_configured() && rocprofsys::get_state() < rocprofsys::State::Active) { _standalone_initialized = true; LOG_DEBUG("Parsing arguments..."); std::string _command_line = {}; for(int i = 0; i < argc; ++i) { _initialize_arguments.emplace_back(argv[i]); _command_line.append(" ").append(argv[i]); } if(_command_line.length() > 1) _command_line = _command_line.substr(1); tim::set_env("ROCPROFSYS_COMMAND_LINE", _command_line, 0); } } void kokkosp_declare_metadata(const char* key, const char* value) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); tim::manager::add_metadata(key, value); } void kokkosp_request_tool_settings(const uint32_t _version, Kokkos_Tools_ToolSettings* _settings) { if(_version > 0) _settings->requires_global_fencing = false; } void kokkosp_init_library([[maybe_unused]] const int loadSeq, [[maybe_unused]] const uint64_t interfaceVer, const uint32_t devInfoCount, void* deviceInfo) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); tim::consume_parameters(devInfoCount, deviceInfo); LOG_DEBUG( "Initializing rocprof-sys kokkos connector (sequence {}, version: {})...", loadSeq, interfaceVer); if(_standalone_initialized || (!rocprofsys::config::settings_are_configured() && rocprofsys::get_state() < rocprofsys::State::Active)) { auto _kokkos_profile_lib = tim::get_env("KOKKOS_TOOLS_LIBS"); if(_kokkos_profile_lib.find("librocprof-sys.so") != std::string::npos) { auto _maps = tim::procfs::read_maps(tim::process::get_id()); auto _libs = std::set{}; for(auto& itr : _maps) { auto&& _path = itr.pathname; if(!_path.empty() && _path.at(0) != '[' && rocprofsys::filepath::exists(_path)) _libs.emplace(_path); } for(const auto& itr : _libs) { if(itr.find("librocprof-sys-dl.so") != std::string::npos) { std::stringstream _libs_str{}; for(const auto& litr : _libs) _libs_str << "- " << litr << "\n"; LOG_CRITICAL( "{} was invoked with librocprof-sys.so as the " "KOKKOS_TOOLS_LIBS." "However, librocprof-sys-dl.so has already been loaded by " "the process. To avoid duplicate collections culminating " "is an error, please set KOKKOS_TOOLS_LIBS={}.Loaded " "libraries: {}", __FUNCTION__, itr, _libs_str.str()); ::rocprofsys::set_state(::rocprofsys::State::Finalized); std::abort(); } } } LOG_DEBUG("Initializing rocprof-sys (standalone)... "); auto _mode = tim::get_env("ROCPROFSYS_MODE", "trace"); auto _arg0 = (_initialize_arguments.empty()) ? std::string{ "unknown" } : _initialize_arguments.at(0); _standalone_initialized = true; rocprofsys_set_mpi_hidden(false, false); rocprofsys_init_hidden(_mode.c_str(), false, _arg0.c_str()); rocprofsys_push_trace_hidden("kokkos_main"); metadata_initialize_kokkos_category(); metadata_initialize_kokkos_track(); } setup_kernel_logger(); tim::trait::runtime_enabled::set( rocprofsys::config::get_use_timemory()); LOG_DEBUG("Done"); _name_len_limit = rocprofsys::config::get_setting_value( "ROCPROFSYS_KOKKOSP_NAME_LENGTH_MAX") .value_or(_name_len_limit); _kp_prefix = rocprofsys::config::get_setting_value( "ROCPROFSYS_KOKKOSP_PREFIX") .value_or(_kp_prefix); _kp_deep_copy = rocprofsys::config::get_setting_value("ROCPROFSYS_KOKKOSP_DEEP_COPY") .value_or(_kp_deep_copy); } void kokkosp_finalize_library() { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); if(_standalone_initialized) { rocprofsys_pop_trace_hidden("kokkos_main"); LOG_DEBUG("Finalizing kokkos rocprof-sys connector (standalone)..."); rocprofsys_finalize_hidden(); } else { LOG_DEBUG("Finalizing kokkos rocprof-sys connector... "); kokkosp::cleanup(); } } //----------------------------------------------------------------------------------// void kokkosp_begin_parallel_for(const char* name, uint32_t devid, uint64_t* kernid) { if(violates_name_rules(name)) return set_invalid_id(kernid); ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); auto pname = (devid > std::numeric_limits::max()) // junk device number ? fmt::format("{} {} [for]", _kp_prefix, name) : fmt::format("{} {} [for][dev{}]", _kp_prefix, name, devid); *kernid = kokkosp::get_unique_id(); kokkosp::logger_t{}.mark(1, __FUNCTION__, name, *kernid); kokkosp::create_profiler(pname, *kernid); kokkosp::start_profiler(*kernid); } void kokkosp_end_parallel_for(uint64_t kernid) { if(is_invalid_id(kernid)) return; ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); kokkosp::stop_profiler(kernid); kokkosp::destroy_profiler(kernid); } //----------------------------------------------------------------------------------// void kokkosp_begin_parallel_reduce(const char* name, uint32_t devid, uint64_t* kernid) { if(violates_name_rules(name)) return set_invalid_id(kernid); ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); auto pname = (devid > std::numeric_limits::max()) // junk device number ? fmt::format("{} {} [reduce]", _kp_prefix, name) : fmt::format("{} {} [reduce][dev{}]", _kp_prefix, name, devid); *kernid = kokkosp::get_unique_id(); kokkosp::logger_t{}.mark(1, __FUNCTION__, name, *kernid); kokkosp::create_profiler(pname, *kernid); kokkosp::start_profiler(*kernid); } void kokkosp_end_parallel_reduce(uint64_t kernid) { if(is_invalid_id(kernid)) return; ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); kokkosp::stop_profiler(kernid); kokkosp::destroy_profiler(kernid); } //----------------------------------------------------------------------------------// void kokkosp_begin_parallel_scan(const char* name, uint32_t devid, uint64_t* kernid) { if(violates_name_rules(name)) return set_invalid_id(kernid); ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); auto pname = (devid > std::numeric_limits::max()) // junk device number ? fmt::format("{} {} [scan]", _kp_prefix, name) : fmt::format("{} {} [scan][dev{}]", _kp_prefix, name, devid); *kernid = kokkosp::get_unique_id(); kokkosp::logger_t{}.mark(1, __FUNCTION__, name, *kernid); kokkosp::create_profiler(pname, *kernid); kokkosp::start_profiler(*kernid); } void kokkosp_end_parallel_scan(uint64_t kernid) { if(is_invalid_id(kernid)) return; ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); kokkosp::stop_profiler(kernid); kokkosp::destroy_profiler(kernid); } //----------------------------------------------------------------------------------// void kokkosp_begin_fence(const char* name, uint32_t devid, uint64_t* kernid) { if(violates_name_rules(name)) return set_invalid_id(kernid); ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); auto pname = (devid > std::numeric_limits::max()) // junk device number ? fmt::format("{} {} [fence]", _kp_prefix, name) : fmt::format("{} {} [fence][dev{}]", _kp_prefix, name, devid); *kernid = kokkosp::get_unique_id(); kokkosp::logger_t{}.mark(1, __FUNCTION__, name, *kernid); kokkosp::create_profiler(pname, *kernid); kokkosp::start_profiler(*kernid); } void kokkosp_end_fence(uint64_t kernid) { if(is_invalid_id(kernid)) return; ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); kokkosp::stop_profiler(kernid); kokkosp::destroy_profiler(kernid); } //----------------------------------------------------------------------------------// void kokkosp_push_profile_region(const char* name) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(1, __FUNCTION__, name); kokkosp::get_profiler_stack() .emplace_back(kokkosp::profiler_t(name)) .start(); } void kokkosp_pop_profile_region() { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__); if(kokkosp::get_profiler_stack().empty()) return; kokkosp::get_profiler_stack().back().stop(); kokkosp::get_profiler_stack().pop_back(); } //----------------------------------------------------------------------------------// void kokkosp_create_profile_section(const char* name, uint32_t* secid) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); *secid = kokkosp::get_unique_id(); auto pname = std::string{ name }; kokkosp::create_profiler(name, *secid); } void kokkosp_destroy_profile_section(uint32_t secid) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::destroy_profiler(secid); } //----------------------------------------------------------------------------------// void kokkosp_start_profile_section(uint32_t secid) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(1, __FUNCTION__, secid); kokkosp::start_profiler(secid); } void kokkosp_stop_profile_section(uint32_t secid) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, secid); kokkosp::stop_profiler(secid); } //----------------------------------------------------------------------------------// void kokkosp_allocate_data(const SpaceHandle space, const char* label, const void* const ptr, const uint64_t size) { if(violates_name_rules(label)) return; if(rocprofsys::config::get_use_causal()) return; ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(0, __FUNCTION__, space.name, label, fmt::format("[{}]", ptr), size); auto pname = fmt::format("{} {} [allocate][{}]", _kp_prefix, label, space.name); kokkosp::profiler_alloc_t<>{ pname }.store(std::plus{}, size); kokkosp::profiler_t{ pname }.mark(); } void kokkosp_deallocate_data(const SpaceHandle space, const char* label, const void* const ptr, const uint64_t size) { if(violates_name_rules(label)) return; if(rocprofsys::config::get_use_causal()) return; ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(0, __FUNCTION__, space.name, label, fmt::format("[{}]", ptr), size); auto pname = fmt::format("{} {} [deallocate][{}]", _kp_prefix, label, space.name); kokkosp::profiler_alloc_t<>{ pname }.store(std::plus{}, size); kokkosp::profiler_t{ pname }.mark(); } //----------------------------------------------------------------------------------// void kokkosp_begin_deep_copy(SpaceHandle dst_handle, const char* dst_name, const void* dst_ptr, SpaceHandle src_handle, const char* src_name, const void* src_ptr, uint64_t size) { if(!_kp_deep_copy || rocprofsys::config::get_use_causal()) return; if(violates_name_rules(dst_name, src_name)) return; ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(1, __FUNCTION__, dst_handle.name, dst_name, fmt::format("[{}]", dst_ptr), src_handle.name, src_name, fmt::format("[{}]", src_ptr), size); auto name = fmt::format("{} {} <- {} [deep_copy]", _kp_prefix, dst_handle.name, dst_name, src_handle.name, src_name); auto& _data = kokkosp::get_profiler_stack(); _data.emplace_back(name); _data.back().audit(dst_handle, dst_name, dst_ptr, src_handle, src_name, src_ptr, size); _data.back().start(); _data.back().store(tim::mpl::piecewise_select{}, std::plus{}, size); } void kokkosp_end_deep_copy() { if(!_kp_deep_copy || rocprofsys::config::get_use_causal()) return; ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__); auto& _data = kokkosp::get_profiler_stack(); if(_data.empty()) return; _data.back().store(tim::mpl::piecewise_select{}, std::minus{}, 0); _data.back().stop(); _data.pop_back(); } //----------------------------------------------------------------------------------// void kokkosp_profile_event(const char* name) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); auto _name = tim::get_hash_identifier_fast(tim::add_hash_id(name)); kokkosp::profiler_t{ _name }.mark(); } //----------------------------------------------------------------------------------// void kokkosp_dual_view_sync(const char* label, const void* const, bool is_device) { if(violates_name_rules(label)) return; auto timestamp = tim::get_clock_real_now(); ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); if(rocprofsys::config::get_use_perfetto()) { auto _name = tim::get_hash_identifier_fast(tim::add_hash_id( fmt::format("{} {} [dual_view_sync]", _kp_prefix, label))); TRACE_EVENT_INSTANT("user", ::perfetto::StaticString{ _name.data() }, "target", (is_device) ? "device" : "host"); } else if(rocprofsys::config::get_use_causal()) { auto _name = tim::get_hash_identifier_fast( tim::add_hash_id(fmt::format("{} {} [dual_view_sync][{}]", _kp_prefix, label, (is_device) ? "device" : "host"))); kokkosp::profiler_t{ _name }.mark(); } cache_kokkos_event(fmt::format("{} {}", _kp_prefix, label).c_str(), "[dual_view_sync]", (is_device) ? "device" : "host", timestamp); } void kokkosp_dual_view_modify(const char* label, const void* const, bool is_device) { if(violates_name_rules(label)) return; auto timestamp = tim::get_clock_real_now(); ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); if(rocprofsys::config::get_use_perfetto()) { auto _name = tim::get_hash_identifier_fast(tim::add_hash_id( fmt::format("{} {} [dual_view_modify]", _kp_prefix, label))); TRACE_EVENT_INSTANT("user", ::perfetto::StaticString{ _name.data() }, "target", (is_device) ? "device" : "host"); } else if(rocprofsys::config::get_use_causal()) { auto _name = tim::get_hash_identifier_fast( tim::add_hash_id(fmt::format("{} {} [dual_view_modify][{}]", _kp_prefix, label, (is_device) ? "device" : "host"))); kokkosp::profiler_t{ _name }.mark(); } cache_kokkos_event(fmt::format("{} {}", _kp_prefix, label).c_str(), "[dual_view_modify]", (is_device) ? "device" : "host", timestamp); } //----------------------------------------------------------------------------------// } TIMEMORY_INITIALIZE_STORAGE(kokkosp::memory_tracker)