diff --git a/projects/rocprofiler-systems/VERSION b/projects/rocprofiler-systems/VERSION index 9ab8337f39..8fdcf38694 100644 --- a/projects/rocprofiler-systems/VERSION +++ b/projects/rocprofiler-systems/VERSION @@ -1 +1 @@ -1.9.1 +1.9.2 diff --git a/projects/rocprofiler-systems/external/timemory b/projects/rocprofiler-systems/external/timemory index 1ab76c36ef..2b92a966d7 160000 --- a/projects/rocprofiler-systems/external/timemory +++ b/projects/rocprofiler-systems/external/timemory @@ -1 +1 @@ -Subproject commit 1ab76c36ef6a13566d4bc3db5c5badc142415369 +Subproject commit 2b92a966d795d8a01476d080bcbd0573dd9fb21f diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-avail/avail.cpp b/projects/rocprofiler-systems/source/bin/omnitrace-avail/avail.cpp index 395f2b05c7..b67c3ca9b5 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-avail/avail.cpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace-avail/avail.cpp @@ -22,6 +22,7 @@ #include "avail.hpp" #include "common.hpp" +#include "common/defines.h" #include "component_categories.hpp" #include "defines.hpp" #include "enumerated_list.hpp" @@ -108,7 +109,11 @@ void write_hw_counter_info(std::ostream&, const array_t& = {}, const array_t& = {}, const array_t& = {}); -int gpu_count = 0; +namespace +{ +// initialize HIP before main so that libomnitrace is not HSA_TOOLS_LIB +int gpu_count = omnitrace::gpu::hip_device_count(); +} // namespace //--------------------------------------------------------------------------------------// @@ -174,11 +179,23 @@ main(int argc, char** argv) parser_t parser("omnitrace-avail"); - parser.enable_help(); - parser.enable_version("omnitrace-avail", "v" OMNITRACE_VERSION_STRING, - OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION); - parser.set_help_width(40); + auto _cols = std::get<0>(tim::utility::console::get_columns()); + if(_cols > parser.get_help_width() + 8) + parser.set_description_width( + std::min(_cols - parser.get_help_width() - 8, 120)); + + parser.enable_help(); + parser.enable_version("omnitrace-avail", OMNITRACE_ARGPARSE_VERSION_INFO); + + parser.add_argument({ "--monochrome" }, "Disable colorized output") + .max_count(1) + .dtype("bool") + .action([&](parser_t& p) { + auto _monochrome = p.get("monochrome"); + tim::log::monochrome() = _monochrome; + p.set_use_color(!_monochrome); + }); parser.add_argument({ "--debug" }, "Enable debug messages") .max_count(1) .action([](parser_t& p) { debug_msg = p.get("debug"); }); @@ -468,12 +485,20 @@ main(int argc, char** argv) } #if OMNITRACE_USE_HIP > 0 - // initialize HIP and call rocm_metrics() which add choices to OMNITRACE_ROCM_EVENTS - // setting - auto _status = hipGetDeviceCount(&gpu_count); - if(gpu_count > 0 && _status == hipSuccess) + if(gpu_count > 0) { - (void) omnitrace::rocprofiler::rocm_metrics(); + size_t _num_metrics = 0; + try + { + // call to rocm_metrics() will add choices to OMNITRACE_ROCM_EVENTS setting + // so always perform this call even if list of HW counters is not requested + _num_metrics = omnitrace::rocprofiler::rocm_metrics().size(); + } catch(std::runtime_error& _e) + { + verbprintf(0, "Retrieving the GPU HW counters failed: %s", _e.what()); + } + verbprintf(0, "Found %i HIP devices and %zu GPU HW counters\n", gpu_count, + _num_metrics); } else { diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-avail/common.hpp b/projects/rocprofiler-systems/source/bin/omnitrace-avail/common.hpp index ab57cad95f..e21335ca15 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-avail/common.hpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace-avail/common.hpp @@ -116,6 +116,8 @@ extern std::string settings_rexclude_exact; // leading matches, e.g. OMNITRACE_MPI_[A-Z_]+ extern std::string settings_rexclude_begin; +constexpr size_t max_error_message_buffer_length = 4096; + //--------------------------------------------------------------------------------------// // functions @@ -149,18 +151,27 @@ file_exists(const std::string&); // control debug printf statements #define errprintf(LEVEL, ...) \ { \ - if(werror || LEVEL < 0) \ + if(LEVEL < verbose_level) \ { \ if(debug_msg || verbose_level >= LEVEL) \ + { \ + fprintf(stderr, "%s", tim::log::color::fatal()); \ fprintf(stderr, "[omnitrace][avail] Error! " __VA_ARGS__); \ - char _buff[FUNCNAMELEN]; \ - sprintf(_buff, "[omnitrace][avail] Error! " __VA_ARGS__); \ + fprintf(stderr, "%s", tim::log::color::end()); \ + } \ + char _buff[max_error_message_buffer_length]; \ + snprintf(_buff, max_error_message_buffer_length, \ + "[omnitrace][avail] Error! " __VA_ARGS__); \ throw std::runtime_error(std::string{ _buff }); \ } \ else \ { \ if(debug_msg || verbose_level >= LEVEL) \ + { \ + fprintf(stderr, "%s", tim::log::color::warning()); \ fprintf(stderr, "[omnitrace][avail] Warning! " __VA_ARGS__); \ + fprintf(stderr, "%s", tim::log::color::end()); \ + } \ } \ fflush(stderr); \ } @@ -169,12 +180,19 @@ file_exists(const std::string&); #define verbprintf(LEVEL, ...) \ { \ if(debug_msg || verbose_level >= LEVEL) \ + { \ + fprintf(stderr, "%s", tim::log::color::info()); \ fprintf(stderr, "[omnitrace][avail] " __VA_ARGS__); \ + fprintf(stderr, "%s", tim::log::color::end()); \ + } \ fflush(stderr); \ } #define verbprintf_bare(LEVEL, ...) \ { \ - if(debug_msg || verbose_level >= LEVEL) fprintf(stderr, __VA_ARGS__); \ + if(debug_msg || verbose_level >= LEVEL) \ + { \ + fprintf(stderr, __VA_ARGS__); \ + } \ fflush(stderr); \ } diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-causal/impl.cpp b/projects/rocprofiler-systems/source/bin/omnitrace-causal/impl.cpp index 5422809c2a..355ebcd19c 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-causal/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace-causal/impl.cpp @@ -549,8 +549,7 @@ parse_args(int argc, char** argv, std::vector& _env, }); parser.enable_help(); - parser.enable_version("omnitrace-causal", "v" OMNITRACE_VERSION_STRING, - OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION); + parser.enable_version("omnitrace-causal", OMNITRACE_ARGPARSE_VERSION_INFO); auto _cols = std::get<0>(console::get_columns()); if(_cols > parser.get_help_width() + 8) diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-instrument/omnitrace-instrument.cpp b/projects/rocprofiler-systems/source/bin/omnitrace-instrument/omnitrace-instrument.cpp index ba4f80ccc2..13bfc2dcdb 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-instrument/omnitrace-instrument.cpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace-instrument/omnitrace-instrument.cpp @@ -461,8 +461,7 @@ main(int argc, char** argv) string_t extra_help = "-- "; parser.enable_help(); - parser.enable_version("omnitrace", "v" OMNITRACE_VERSION_STRING, - OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION); + parser.enable_version("omnitrace-instrument", OMNITRACE_ARGPARSE_VERSION_INFO); parser.add_argument({ "" }, ""); parser.add_argument({ "[DEBUG OPTIONS]" }, ""); diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-run/impl.cpp b/projects/rocprofiler-systems/source/bin/omnitrace-run/impl.cpp index 71a0a163fb..44239498cd 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-run/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace-run/impl.cpp @@ -292,8 +292,7 @@ parse_args(int argc, char** argv, parser_data_t& _parser_data) }); parser.enable_help("", "Usage: omnitrace-run -- "); - parser.enable_version("omnitrace-run", "v" OMNITRACE_VERSION_STRING, - OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION); + parser.enable_version("omnitrace-run", OMNITRACE_ARGPARSE_VERSION_INFO); auto _cols = std::get<0>(console::get_columns()); if(_cols > parser.get_help_width() + 8) diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-sample/impl.cpp b/projects/rocprofiler-systems/source/bin/omnitrace-sample/impl.cpp index 797a5f99d0..54ddf24282 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-sample/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace-sample/impl.cpp @@ -369,8 +369,7 @@ parse_args(int argc, char** argv, std::vector& _env) parser.set_use_color(true); parser.enable_help(); - parser.enable_version("omnitrace-sample", "v" OMNITRACE_VERSION_STRING, - OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION); + parser.enable_version("omnitrace-sample", OMNITRACE_ARGPARSE_VERSION_INFO); auto _cols = std::get<0>(tim::utility::console::get_columns()); if(_cols > parser.get_help_width() + 8) diff --git a/projects/rocprofiler-systems/source/lib/common/defines.h.in b/projects/rocprofiler-systems/source/lib/common/defines.h.in index 2a8f87c902..2784bb58e0 100644 --- a/projects/rocprofiler-systems/source/lib/common/defines.h.in +++ b/projects/rocprofiler-systems/source/lib/common/defines.h.in @@ -30,6 +30,17 @@ #define OMNITRACE_GIT_DESCRIBE "@OMNITRACE_GIT_DESCRIBE@" #define OMNITRACE_GIT_REVISION "@OMNITRACE_GIT_REVISION@" +// system info during compilation +#define OMNITRACE_LIBRARY_ARCH "@CMAKE_LIBRARY_ARCHITECTURE@" +#define OMNITRACE_SYSTEM_NAME "@CMAKE_SYSTEM_NAME@" +#define OMNITRACE_SYSTEM_PROCESSOR "@CMAKE_SYSTEM_PROCESSOR@" +#define OMNITRACE_SYSTEM_VERSION "@CMAKE_SYSTEM_VERSION@" + +// compiler information +#define OMNITRACE_COMPILER_ID "@CMAKE_CXX_COMPILER_ID@" +#define OMNITRACE_COMPILER_VERSION "@CMAKE_CXX_COMPILER_VERSION@" +#define OMNITRACE_COMPILER_STRING OMNITRACE_COMPILER_ID " v" OMNITRACE_COMPILER_VERSION + #define OMNITRACE_DEFAULT_ROCM_PATH "@ROCmVersion_DIR@" #define OMNITRACE_HIP_VERSION_STRING "@OMNITRACE_HIP_VERSION@" #define OMNITRACE_HIP_VERSION_MAJOR @OMNITRACE_HIP_VERSION_MAJOR@ @@ -45,6 +56,27 @@ ((10000 * OMNITRACE_HIP_VERSION_MAJOR) + (100 * OMNITRACE_HIP_VERSION_MINOR) + \ OMNITRACE_HIP_VERSION_PATCH) +#if OMNITRACE_HIP_VERSION_MAJOR > 0 +# define OMNITRACE_HIP_VERSION_COMPAT_STRING \ + "v@OMNITRACE_HIP_VERSION_MAJOR@.@OMNITRACE_HIP_VERSION_MINOR@.x" +#else +# define OMNITRACE_HIP_VERSION_COMPAT_STRING "" +#endif + +// this should be passed to argparse::argument_parser::enable_version +// Example: +// parser.enable_version(, OMNITRACE_ARGPARSE_VERSION_INFO); +#if !defined(OMNITRACE_ARGPARSE_VERSION_INFO) +# define OMNITRACE_ARGPARSE_VERSION_INFO \ + "v" OMNITRACE_VERSION_STRING, OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION, \ + { \ + { "", OMNITRACE_LIBRARY_ARCH }, { "compiler", OMNITRACE_COMPILER_STRING }, \ + { \ + "rocm", OMNITRACE_HIP_VERSION_COMPAT_STRING \ + } \ + } +#endif + // clang-format off #if !defined(OMNITRACE_MAX_THREADS) # define OMNITRACE_MAX_THREADS @OMNITRACE_MAX_THREADS@ diff --git a/projects/rocprofiler-systems/source/lib/core/config.cpp b/projects/rocprofiler-systems/source/lib/core/config.cpp index d9ce292668..0acc205523 100644 --- a/projects/rocprofiler-systems/source/lib/core/config.cpp +++ b/projects/rocprofiler-systems/source/lib/core/config.cpp @@ -255,6 +255,14 @@ configure_settings(bool _init) tim::manager::add_metadata("OMNITRACE_GIT_DESCRIBE", OMNITRACE_GIT_DESCRIBE); tim::manager::add_metadata("OMNITRACE_GIT_REVISION", OMNITRACE_GIT_REVISION); + tim::manager::add_metadata("OMNITRACE_LIBRARY_ARCH", OMNITRACE_LIBRARY_ARCH); + tim::manager::add_metadata("OMNITRACE_SYSTEM_NAME", OMNITRACE_SYSTEM_NAME); + tim::manager::add_metadata("OMNITRACE_SYSTEM_PROCESSOR", OMNITRACE_SYSTEM_PROCESSOR); + tim::manager::add_metadata("OMNITRACE_SYSTEM_VERSION", OMNITRACE_SYSTEM_VERSION); + + tim::manager::add_metadata("OMNITRACE_COMPILER_ID", OMNITRACE_COMPILER_ID); + tim::manager::add_metadata("OMNITRACE_COMPILER_VERSION", OMNITRACE_COMPILER_VERSION); + #if OMNITRACE_HIP_VERSION > 0 tim::manager::add_metadata("OMNITRACE_HIP_VERSION", OMNITRACE_HIP_VERSION_STRING); tim::manager::add_metadata("OMNITRACE_HIP_VERSION_MAJOR", @@ -1511,38 +1519,37 @@ print_banner(std::ostream& _os) \______/ |__| |__| |__| \__| |__| |__| | _| `._____/__/ \__\ \______||_______| )banner"; - auto _tag = std::string_view{ OMNITRACE_GIT_DESCRIBE }; - auto _rev = std::string_view{ OMNITRACE_GIT_REVISION }; -#if OMNITRACE_HIP_VERSION_MAJOR > 0 - auto _hip = JOIN('.', OMNITRACE_HIP_VERSION_MAJOR, OMNITRACE_HIP_VERSION_MINOR, "x"); -#else - auto _hip = std::string_view{}; -#endif std::stringstream _version_info{}; _version_info << "omnitrace v" << OMNITRACE_VERSION_STRING; - if(!_tag.empty() || !_rev.empty() || !_hip.empty()) - { - _version_info << " ("; - if(!_tag.empty()) - { - _version_info << "tag: " << OMNITRACE_GIT_DESCRIBE; - if(!_rev.empty()) _version_info << ", "; - } - if(!_rev.empty()) - { - _version_info << "rev: " << OMNITRACE_GIT_REVISION; - if(!_hip.empty()) _version_info << ", "; - } + namespace join = ::timemory::join; - if(!_hip.empty()) - { - _version_info << "rocm: " << _hip; - } - } + // assemble the list of properties + auto _generate_properties = + [](std::initializer_list>&& _data) { + auto _property_info = std::vector{}; + _property_info.reserve(_data.size()); + for(const auto& itr : _data) + { + if(!itr.second.empty()) + _property_info.emplace_back( + itr.first.empty() ? itr.second + : join::join(": ", itr.first, itr.second)); + } + return _property_info; + }; - if(!_version_info.str().empty()) _version_info << ")"; + auto _properties = + _generate_properties({ { "rev", OMNITRACE_GIT_REVISION }, + { "tag", OMNITRACE_GIT_DESCRIBE }, + { "", OMNITRACE_LIBRARY_ARCH }, + { "compiler", OMNITRACE_COMPILER_STRING }, + { "rocm", OMNITRACE_HIP_VERSION_COMPAT_STRING } }); + + // () + if(!_properties.empty()) + _version_info << join::join(join::array_config{ ", ", " (", ")" }, _properties); tim::log::stream(_os, tim::log::color::info()) << _banner << _version_info.str(); _os << std::endl; diff --git a/projects/rocprofiler-systems/source/lib/core/perfetto.cpp b/projects/rocprofiler-systems/source/lib/core/perfetto.cpp index c5da6a5fbc..ef7fa01151 100644 --- a/projects/rocprofiler-systems/source/lib/core/perfetto.cpp +++ b/projects/rocprofiler-systems/source/lib/core/perfetto.cpp @@ -124,12 +124,12 @@ start() } else { - OMNITRACE_VERBOSE(0, "Resuming perfetto...\n"); + OMNITRACE_VERBOSE(2, "Resuming perfetto...\n"); _tmp_file->fopen("a+"); } } - OMNITRACE_VERBOSE(0, "Setup perfetto...\n"); + OMNITRACE_VERBOSE(2, "Setup perfetto...\n"); int _fd = (_tmp_file) ? _tmp_file->fd : -1; auto& cfg = get_config(); tracing_session->Setup(cfg, _fd); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/causal/components/backtrace.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/causal/components/backtrace.hpp index eb85aa21a7..c9b78aa8ba 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/causal/components/backtrace.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/causal/components/backtrace.hpp @@ -45,17 +45,13 @@ namespace causal { namespace component { -struct sample_rate -: tim::component::empty_base -, tim::concepts::component +struct sample_rate : comp::empty_base { using value_type = void; static void sample(int = -1); }; -struct backtrace -: tim::component::empty_base -, tim::concepts::component +struct backtrace : comp::empty_base { using value_type = void; using sample_data_set_t = std::set; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/causal/delay.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/causal/delay.hpp index af56174aee..6f57d63339 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/causal/delay.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/causal/delay.hpp @@ -38,9 +38,7 @@ namespace omnitrace { namespace causal { -struct delay -: tim::component::empty_base -, tim::concepts::component +struct delay : comp::empty_base { using value_type = void; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.hpp index 9652b67468..33a1316967 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.hpp @@ -45,9 +45,7 @@ namespace omnitrace { namespace component { -struct backtrace -: tim::component::empty_base -, tim::concepts::component +struct backtrace : comp::empty_base { static constexpr size_t stack_depth = OMNITRACE_MAX_UNWIND_DEPTH; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace_metrics.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace_metrics.hpp index d9f6f6e540..223b74c820 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace_metrics.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace_metrics.hpp @@ -51,9 +51,7 @@ using type_list = ::tim::type_list; namespace component { -struct backtrace_metrics -: tim::component::empty_base -, concepts::component +struct backtrace_metrics : comp::empty_base { static constexpr size_t num_hw_counters = TIMEMORY_PAPI_ARRAY_SIZE; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace_timestamp.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace_timestamp.hpp index fc49d8e757..69fa2af525 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace_timestamp.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace_timestamp.hpp @@ -38,9 +38,7 @@ namespace omnitrace { namespace component { -struct backtrace_timestamp -: tim::component::empty_base -, tim::concepts::component +struct backtrace_timestamp : comp::empty_base { using value_type = void; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/cpu_freq.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/cpu_freq.hpp index d352613b53..66904bf8c9 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/cpu_freq.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/cpu_freq.hpp @@ -34,12 +34,11 @@ namespace omnitrace namespace component { struct cpu_freq -: tim::concepts::component -, tim::component::empty_base +: comp::empty_base , tim::component::base_format , tim::component::base_data, 1> { - using base_type = tim::component::empty_base; + using base_type = comp::empty_base; using this_type = cpu_freq; using value_type = std::vector; using storage_type = tim::storage; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm.cpp index 3e6b9683cc..4d9e9a2087 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm.cpp @@ -127,6 +127,8 @@ extern "C" void OnLoadToolProp(rocprofiler_settings_t* settings) { + using ::rocprofiler::util::HsaRsrcFactory; + OMNITRACE_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading...\n"); rocm::lock_t _lk{ rocm::rocm_mutex, std::defer_lock }; @@ -332,6 +334,8 @@ extern "C" } else { + using ::rocprofiler::util::HsaRsrcFactory; + HsaRsrcFactory::Instance().PrintGpuAgents("ROCm"); } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm/hsa_rsrc_factory.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm/hsa_rsrc_factory.cpp index b11954f6cc..705a508d5f 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm/hsa_rsrc_factory.cpp @@ -28,6 +28,7 @@ THE SOFTWARE. #include #include +#include #include #include #include @@ -43,8 +44,23 @@ THE SOFTWARE. #include #include #include +#include #include +namespace rocprofiler +{ +namespace util +{ +// Demangle C++ symbol name +static const char* +cpp_demangle(const char* symname) +{ + size_t size = 0; + int status; + const char* ret = abi::__cxa_demangle(symname, nullptr, &size, &status); + return (ret != nullptr) ? ret : strdup(symname); +} + // Callback function to get available in the system agents hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) @@ -168,10 +184,20 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) CHECK_STATUS("HSA timer allocation failed", (timer_ == nullptr) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + // Time correlation + const uint32_t corr_iters = 1000; + for(unsigned time_id = 0; time_id < HsaTimer::TIME_ID_NUMBER; time_id += 1) + { + CorrelateTime((HsaTimer::time_id_t) time_id, corr_iters); + } + // System timeout timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); + + // To dump code objects + to_dump_code_obj_ = getenv("ROCP_DUMP_CODEOBJ"); } // Destructor of the class @@ -214,12 +240,12 @@ HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; - hsa_api_.hsa_queue_load_write_index_relaxed = - table->core_->hsa_queue_load_write_index_relaxed_fn; - hsa_api_.hsa_queue_store_write_index_relaxed = - table->core_->hsa_queue_store_write_index_relaxed_fn; hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + hsa_api_.hsa_queue_load_write_index_relaxed = + table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_api_.hsa_queue_add_write_index_scacq_screl = + table->core_->hsa_queue_add_write_index_scacq_screl_fn; hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; @@ -236,7 +262,8 @@ HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) table->core_->hsa_executable_create_alt_fn; hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; - hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_destroy = table->core_->hsa_executable_destroy_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; hsa_api_.hsa_executable_symbol_get_info = @@ -277,12 +304,12 @@ HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) hsa_api_.hsa_queue_create = hsa_queue_create; hsa_api_.hsa_queue_destroy = hsa_queue_destroy; - hsa_api_.hsa_queue_load_write_index_relaxed = - hsa_queue_load_write_index_relaxed; - hsa_api_.hsa_queue_store_write_index_relaxed = - hsa_queue_store_write_index_relaxed; hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + hsa_api_.hsa_queue_load_write_index_relaxed = + hsa_queue_load_write_index_relaxed; + hsa_api_.hsa_queue_add_write_index_scacq_screl = + hsa_queue_add_write_index_scacq_screl; hsa_api_.hsa_signal_create = hsa_signal_create; hsa_api_.hsa_signal_destroy = hsa_signal_destroy; @@ -297,6 +324,7 @@ HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_destroy = hsa_executable_destroy; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; @@ -400,8 +428,10 @@ HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) agent_info->dev_id = agent; agent_info->dev_type = HSA_DEVICE_TYPE_GPU; hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); - strncpy(agent_info->gfxip, agent_info->name, 4); - agent_info->gfxip[4] = '\0'; + const int gfxip_label_len = + std::min(strlen(agent_info->name) - 2, sizeof(agent_info->gfxip) - 1); + memcpy(agent_info->gfxip, agent_info->name, gfxip_label_len); + agent_info->gfxip[gfxip_label_len] = '\0'; hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, @@ -437,7 +467,12 @@ HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) agent_info->vgpr_block_size = 4; // Set GPU index - agent_info->dev_index = gpu_list_.size(); + uint32_t driver_node_id; + status = hsa_api_.hsa_agent_get_info( + agent, static_cast(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID), + &driver_node_id); + CHECK_STATUS("hsa_agent_get_info(gpu hsa_driver_node_id)", status); + agent_info->dev_index = driver_node_id; gpu_list_.push_back(agent_info); gpu_agents_.push_back(agent); } @@ -646,22 +681,27 @@ HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) } // Wait signal -void -HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const +hsa_signal_value_t +HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, + const hsa_signal_value_t& signal_value) const { + const hsa_signal_value_t exp_value = signal_value - 1; + hsa_signal_value_t ret_value = signal_value; while(true) { - const hsa_signal_value_t signal_value = hsa_api_.hsa_signal_wait_scacquire( - signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); - if(signal_value == 0) + ret_value = hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, + signal_value, timeout_, + HSA_WAIT_STATE_BLOCKED); + if(ret_value == exp_value) break; + if(ret_value != signal_value) { - break; - } - else - { - CHECK_STATUS("hsa_signal_wait_scacquire()", HSA_STATUS_ERROR); + std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" + << signal_value << "), ret_value(" << ret_value << ")" << std::endl + << std::flush; + abort(); } } + return ret_value; } // Wait signal with signal value restore @@ -669,7 +709,7 @@ void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { - SignalWait(signal); + SignalWait(signal, signal_value); hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } @@ -686,7 +726,7 @@ HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, siz status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, nullptr, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); - SignalWait(s); + SignalWait(s, 1); status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } @@ -766,6 +806,8 @@ HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_pa agent_info->dev_id, 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); + close(file_handle); + // Update output parameter *code_desc = kernelSymbol; return true; @@ -808,20 +850,36 @@ HsaRsrcFactory::PrintGpuAgents(const std::string&) return true; } +void* +HsaRsrcFactory::GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx) +{ + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; + const uint32_t slot_idx = (uint32_t)(idx % queue->size); + void* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + + (slot_idx * slot_size_b)); + return queue_slot; +} + +void* +HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue) +{ + const uint64_t read_idx = hsa_api_.hsa_queue_load_read_index_relaxed(queue); + return GetSlotPointer(queue, read_idx); +} + uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue - const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue); - hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1); while((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { sched_yield(); } - uint32_t slot_idx = (uint32_t)(write_idx % queue->size); + const uint32_t slot_idx = (uint32_t)(write_idx % queue->size); uint32_t* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); const uint32_t* slot_data = reinterpret_cast(packet); @@ -862,29 +920,32 @@ HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes } const char* -HsaRsrcFactory::GetKernelName(uint64_t addr) +HsaRsrcFactory::GetKernelNameRef(uint64_t addr) { std::lock_guard lck(mutex_); const auto it = symbols_map_->find(addr); if(it == symbols_map_->end()) { - fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); + fprintf(stderr, + "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx) is not found\n", + addr); abort(); } - return strdup(it->second); + return it->second; } void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { std::lock_guard lck(mutex_); - executable_tracking_on_ = true; - table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; + executable_tracking_on_ = true; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; + table->core_->hsa_executable_destroy_fn = hsa_executable_destroy_interceptor; } hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t /*exec*/, - hsa_executable_symbol_t symbol, void* /*data*/) + hsa_executable_symbol_t symbol, void* data) { hsa_symbol_kind_t value = (hsa_symbol_kind_t) 0; hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info( @@ -900,17 +961,26 @@ HsaRsrcFactory::executable_symbols_cb(hsa_executable_t /*exec*/, status = hsa_api_.hsa_executable_symbol_get_info( symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); CHECK_STATUS("Error in getting name len", status); - char* name = new char[len + 1]; - status = hsa_api_.hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + char* symname = new char[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symname); CHECK_STATUS("Error in getting kernel name", status); - name[len] = 0; - auto ret = symbols_map_->insert({ addr, name }); - if(ret.second == false) + symname[len] = 0; + if(data == nullptr) { - delete[] ret.first->second; - ret.first->second = name; + const char* name = cpp_demangle(symname); + auto ret = symbols_map_->insert({ addr, name }); + if(ret.second == false) + { + delete[] ret.first->second; + ret.first->second = name; + } } + else + { + symbols_map_->erase(addr); + } + delete[] symname; } return HSA_STATUS_SUCCESS; } @@ -925,7 +995,19 @@ HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, executable, executable_symbols_cb, nullptr); CHECK_STATUS("Error in iterating executable symbols", status); return hsa_api_.hsa_executable_freeze(executable, options); - ; +} + +hsa_status_t +HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable) +{ + std::lock_guard lck(mutex_); + if(symbols_map_ != nullptr) + { + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols( + executable, executable_symbols_cb, (void*) 1); + CHECK_STATUS("Error in iterating executable symbols", status); + } + return hsa_api_.hsa_executable_destroy(executable); } std::atomic HsaRsrcFactory::instance_{}; @@ -934,3 +1016,7 @@ HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP hsa_pfn_t HsaRsrcFactory::hsa_api_{}; bool HsaRsrcFactory::executable_tracking_on_ = false; HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = nullptr; +void* HsaRsrcFactory::to_dump_code_obj_ = nullptr; + +} // namespace util +} // namespace rocprofiler diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm/hsa_rsrc_factory.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm/hsa_rsrc_factory.hpp index f0bfd22b28..2d68b5ec68 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm/hsa_rsrc_factory.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/rocm/hsa_rsrc_factory.hpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -77,6 +78,10 @@ } \ } while(0) +namespace rocprofiler +{ +namespace util +{ static const size_t MEM_PAGE_BYTES = 0x1000; static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; @@ -88,11 +93,12 @@ struct hsa_pfn_t decltype(::hsa_agent_get_info)* hsa_agent_get_info; decltype(::hsa_iterate_agents)* hsa_iterate_agents; - decltype(::hsa_queue_create)* hsa_queue_create; - decltype(::hsa_queue_destroy)* hsa_queue_destroy; - decltype(::hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; - decltype(::hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; - decltype(::hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + decltype(::hsa_queue_create)* hsa_queue_create; + decltype(::hsa_queue_destroy)* hsa_queue_destroy; + decltype(::hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + decltype(::hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; + decltype( + ::hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl; decltype(::hsa_signal_create)* hsa_signal_create; decltype(::hsa_signal_destroy)* hsa_signal_destroy; @@ -107,6 +113,7 @@ struct hsa_pfn_t decltype( ::hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; decltype(::hsa_executable_freeze)* hsa_executable_freeze; + decltype(::hsa_executable_destroy)* hsa_executable_destroy; decltype(::hsa_executable_get_symbol)* hsa_executable_get_symbol; decltype(::hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; decltype(::hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; @@ -180,10 +187,11 @@ struct AgentInfo // Number of Shader Arrays Per Shader Engines in Gpu uint32_t shader_arrays_per_se; - // SGPR/VGPR block sizes - uint32_t sgpr_block_dflt; - uint32_t sgpr_block_size; - uint32_t vgpr_block_size; + // SGPR/VGPR/LDS block sizes + uint32_t sgpr_block_dflt; + uint32_t sgpr_block_size; + uint32_t vgpr_block_size; + static const uint32_t lds_block_size = 128 * 4; }; // HSA timer class @@ -195,6 +203,16 @@ public: static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; + enum time_id_t + { + TIME_ID_CLOCK_REALTIME = 0, + TIME_ID_CLOCK_REALTIME_COARSE = 1, + TIME_ID_CLOCK_MONOTONIC = 2, + TIME_ID_CLOCK_MONOTONIC_COARSE = 3, + TIME_ID_CLOCK_MONOTONIC_RAW = 4, + TIME_ID_NUMBER + }; + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { @@ -215,6 +233,12 @@ public: return timestamp_t((freq_t) time / sysclock_factor_); } + // Method for timespec/ns conversion + static timestamp_t timespec_to_ns(const timespec& time) + { + return ((timestamp_t) time.tv_sec * 1000000000) + time.tv_nsec; + } + // Return timestamp in 'ns' timestamp_t timestamp_ns() const { @@ -225,6 +249,57 @@ public: return sysclock_to_ns(sysclock); } + // Return time in 'ns' + timestamp_t clocktime_ns(clockid_t clock_id) const + { + timespec time; + clock_gettime(clock_id, &time); + return timespec_to_ns(time); + } + + // Return pair of correlated values of profiling timestamp and time with + // correlation error for a given time ID and number of iterations + void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v, + timestamp_t* time_v, timestamp_t* error_v) + { + clockid_t clock_id = 0; + switch(time_id) + { + case TIME_ID_CLOCK_REALTIME: clock_id = CLOCK_REALTIME; break; + case TIME_ID_CLOCK_REALTIME_COARSE: clock_id = CLOCK_REALTIME_COARSE; break; + case TIME_ID_CLOCK_MONOTONIC: clock_id = CLOCK_MONOTONIC; break; + case TIME_ID_CLOCK_MONOTONIC_COARSE: clock_id = CLOCK_MONOTONIC_COARSE; break; + case TIME_ID_CLOCK_MONOTONIC_RAW: clock_id = CLOCK_MONOTONIC_RAW; break; + default: CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR); + } + + std::vector ts_vec(iters); + std::vector tm_vec(iters); + const uint32_t steps = iters - 1; + + for(uint32_t i = 0; i < iters; ++i) + { + hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]); + clock_gettime(clock_id, &tm_vec[i]); + } + + const timestamp_t ts_base = sysclock_to_ns(ts_vec.front()); + const timestamp_t tm_base = timespec_to_ns(tm_vec.front()); + const timestamp_t error = (ts_vec.back() - ts_vec.front()) / (2 * steps); + + timestamp_t ts_accum = 0; + timestamp_t tm_accum = 0; + for(uint32_t i = 0; i < iters; ++i) + { + ts_accum += (ts_vec[i] - ts_base); + tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base); + } + + *timestamp_v = (ts_accum / iters) + ts_base + error; + *time_v = (tm_accum / iters) + tm_base; + *error_v = error; + } + private: // Timestamp frequency factor freq_t sysclock_factor_; @@ -332,7 +407,8 @@ public: uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); // Wait signal - void SignalWait(const hsa_signal_t& signal) const; + hsa_signal_value_t SignalWait(const hsa_signal_t& signal, + const hsa_signal_value_t& signal_value) const; // Wait signal with signal value restore void SignalWaitRestore(const hsa_signal_t& signal, @@ -359,14 +435,16 @@ public: // Print the various fields of Hsa Gpu Agents bool PrintGpuAgents(const std::string& header); - // Submit AQL packet to given queue + // Utils for submitting AQL packet to a given queue + static void* GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx); + static void* GetReadPointer(hsa_queue_t* queue); static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); // Enable executables loading tracking static bool IsExecutableTracking() { return executable_tracking_on_; } static void EnableExecutableTracking(HsaApiTable* table); - static const char* GetKernelName(uint64_t addr); + static const char* GetKernelNameRef(uint64_t addr); // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); @@ -400,6 +478,29 @@ public: Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); } + void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters) + { + timestamp_t timestamp_v = 0; + timestamp_t time_v = 0; + timestamp_t error_v = 0; + timer_->correlated_pair_ns(time_id, iters, ×tamp_v, &time_v, &error_v); + time_shift_[time_id] = time_v - timestamp_v; + time_error_[time_id] = error_v; + } + + hsa_status_t GetTimeVal(uint32_t time_id, uint64_t time_stamp, uint64_t* time_value) + { + if(time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; + *time_value = time_stamp + time_shift_[time_id]; + return HSA_STATUS_SUCCESS; + } + + hsa_status_t GetTimeErr(uint32_t time_id, uint64_t* err) + { + *err = time_error_[time_id]; + return HSA_STATUS_SUCCESS; + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -444,8 +545,10 @@ private: typedef std::map symbols_map_t; static symbols_map_t* symbols_map_; static bool executable_tracking_on_; + static void* to_dump_code_obj_; static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char* options); + static hsa_status_t hsa_executable_destroy_interceptor(hsa_executable_t executable); static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void* data); @@ -466,7 +569,14 @@ private: // HSA timer HsaTimer* timer_; + // Time shift array to support time conversion + timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER]; + timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER]; + // CPU/kern-arg memory pools hsa_amd_memory_pool_t* cpu_pool_; hsa_amd_memory_pool_t* kern_arg_pool_; }; + +} // namespace util +} // namespace rocprofiler diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/rocprofiler.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/rocprofiler.cpp index 174c015a52..e80bbd56e2 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/rocprofiler.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/rocprofiler.cpp @@ -56,6 +56,9 @@ namespace rocprofiler { namespace { +using ::rocprofiler::util::AgentInfo; +using ::rocprofiler::util::HsaRsrcFactory; + auto& get_event_names() { @@ -271,30 +274,25 @@ metrics_input(unsigned _device, rocprofiler_feature_t** ret) return feature_count; } -struct info_data -{ - const AgentInfo* agent = nullptr; - std::vector* data = nullptr; -}; +using info_data = std::vector; hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void* arg) { using qualifier_t = tim::hardware_counters::qualifier; using qualifier_vec_t = std::vector; - auto* _arg = static_cast(arg); - const auto* _agent = _arg->agent; - auto* _data = _arg->data; + auto* _data = static_cast(arg); + auto _dev_index = info.agent_index; switch(info.kind) { case ROCPROFILER_INFO_KIND_METRIC: { - auto _device_qualifier_sym = JOIN("", ":device=", _agent->dev_index); - auto _device_qualifier = tim::hardware_counters::qualifier{ - true, static_cast(_agent->dev_index), _device_qualifier_sym, - JOIN(" ", "Device", _agent->dev_index) - }; + auto _device_qualifier_sym = JOIN("", ":device=", _dev_index); + auto _device_qualifier = + tim::hardware_counters::qualifier{ true, static_cast(_dev_index), + _device_qualifier_sym, + JOIN(" ", "Device", _dev_index) }; auto _long_desc = std::string{ info.metric.description }; auto _units = std::string{}; auto _pysym = std::string{}; @@ -313,7 +311,7 @@ info_data_callback(const rocprofiler_info_data_t info, void* arg) { auto _sym = JOIN("", info.metric.name, _device_qualifier_sym); auto _short_desc = - JOIN("", info.metric.name, " on device ", _agent->dev_index); + JOIN("", info.metric.name, " on device ", _dev_index); _data->emplace_back(component::rocm_info_entry( true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, _pysym, _short_desc, _long_desc, _units, @@ -331,7 +329,7 @@ info_data_callback(const rocprofiler_info_data_t info, void* arg) auto _sym = JOIN("", info.metric.name, _instance_qualifier_sym, _device_qualifier_sym); auto _short_desc = JOIN("", info.metric.name, " instance ", i, - " on device ", _agent->dev_index); + " on device ", _dev_index); _data->emplace_back(component::rocm_info_entry( true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, _pysym, _short_desc, _long_desc, _units, @@ -369,12 +367,26 @@ rocm_metrics() const AgentInfo** _agent_p = &_agent; HsaRsrcFactory::Instance().GetGpuAgentInfo(i, _agent_p); - auto _v = info_data{ _agent, &_data }; - if(!rocm_check_status( - rocprofiler_iterate_info(&_agent->dev_id, ROCPROFILER_INFO_KIND_METRIC, - info_data_callback, reinterpret_cast(&_v)), - { HSA_STATUS_ERROR_NOT_INITIALIZED })) - return _data; + if(!rocm_check_status(rocprofiler_iterate_info( + &_agent->dev_id, ROCPROFILER_INFO_KIND_METRIC, + info_data_callback, reinterpret_cast(&_data)), + { HSA_STATUS_ERROR_NOT_INITIALIZED })) + { + OMNITRACE_WARNING_F(-1, "rocprofiler_iterate_info failed for gpu agent %u\n", + i); + } + } + + if(gpu_count > 0 && _data.empty()) + { + if(!rocm_check_status(rocprofiler_iterate_info( + nullptr, ROCPROFILER_INFO_KIND_METRIC, + info_data_callback, reinterpret_cast(&_data)), + { HSA_STATUS_ERROR_NOT_INITIALIZED })) + { + OMNITRACE_WARNING_F(-1, "rocprofiler_iterate_info failed for %i gpu agents\n", + gpu_count); + } } auto _settings = tim::settings::shared_instance();