diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3.py b/projects/rocprofiler-sdk/source/bin/rocprofv3.py index 806fc454bb..7c621f760b 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3.py @@ -546,6 +546,23 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins nargs="*", ) + advanced_options.add_argument( + "--rocm-root", + help="Use the given path as the root ROCm path instead of the relative path of this script", + type=str, + metavar="PATH", + default=None, + ) + add_parser_bool_argument( + advanced_options, + "--readlink", + help=argparse.SUPPRESS, + ) + add_parser_bool_argument( + advanced_options, + "--realpath", + help=argparse.SUPPRESS, + ) # below is available for CI because LD_PRELOADing a library linked to a sanitizer library # causes issues in apps where HIP is part of shared library. add_parser_bool_argument( @@ -874,6 +891,8 @@ def run(app_args, args, **kwargs): ROCPROFV3_DIR = os.path.dirname(os.path.realpath(__file__)) ROCM_DIR = os.path.dirname(ROCPROFV3_DIR) + if args.rocm_root is not None: + ROCM_DIR = os.path.abspath(args.rocm_root) ROCPROF_TOOL_LIBRARY = f"{ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" ROCPROF_SDK_LIBRARY = f"{ROCM_DIR}/lib/librocprofiler-sdk.so" ROCPROF_ROCTX_LIBRARY = f"{ROCM_DIR}/lib/librocprofiler-sdk-roctx.so" @@ -884,6 +903,22 @@ def run(app_args, args, **kwargs): f"{ROCM_DIR}/libexec/rocprofiler-sdk/librocprofv3-list-avail.so" ) + def resolve_path(val): + if not os.path.exists(val): + fatal_error(f"{val} does not exist") + if os.path.islink(val): + if args.readlink: + val = os.path.abspath(os.readlink(val)) + if args.realpath: + val = os.path.realpath(val) + return val + + ROCPROF_TOOL_LIBRARY = resolve_path(ROCPROF_TOOL_LIBRARY) + ROCPROF_SDK_LIBRARY = resolve_path(ROCPROF_SDK_LIBRARY) + ROCPROF_ROCTX_LIBRARY = resolve_path(ROCPROF_ROCTX_LIBRARY) + ROCPROF_KOKKOSP_LIBRARY = resolve_path(ROCPROF_KOKKOSP_LIBRARY) + ROCPROF_LIST_AVAIL_TOOL_LIBRARY = resolve_path(ROCPROF_LIST_AVAIL_TOOL_LIBRARY) + prepend_preload = [itr for itr in args.preload if itr] append_preload = [ ROCPROF_TOOL_LIBRARY, diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3_avail.py b/projects/rocprofiler-sdk/source/bin/rocprofv3_avail.py old mode 100644 new mode 100755 index 98b6f8edc2..64a5a166bd --- a/projects/rocprofiler-sdk/source/bin/rocprofv3_avail.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3_avail.py @@ -69,10 +69,21 @@ class pc_config: self.max_interval = max_interval +ROCPROFV3_AVAIL_DIR = os.path.dirname(os.path.realpath(__file__)) +ROCM_DIR = os.path.dirname(ROCPROFV3_AVAIL_DIR) +ROCPROF_LIST_AVAIL_TOOL_LIBRARY = ( + f"{ROCM_DIR}/libexec/rocprofiler-sdk/librocprofv3-list-avail.so" +) + MAX_STR = 256 -libname = os.environ.get("ROCPROF_LIST_AVAIL_TOOL_LIBRARY") +libname = os.environ.get( + "ROCPROF_LIST_AVAIL_TOOL_LIBRARY", ROCPROF_LIST_AVAIL_TOOL_LIBRARY +) c_lib = ctypes.CDLL(libname) +if c_lib is None: + fatal_error(f"Error opening {libname}") + c_lib.get_number_of_counters.restype = ctypes.c_ulong c_lib.get_number_of_pc_sample_configs.restype = ctypes.c_ulong c_lib.get_number_of_dimensions.restype = ctypes.c_ulong diff --git a/projects/rocprofiler-sdk/source/lib/common/container/ring_buffer.cpp b/projects/rocprofiler-sdk/source/lib/common/container/ring_buffer.cpp index fd4f3155f4..f42bd339c2 100644 --- a/projects/rocprofiler-sdk/source/lib/common/container/ring_buffer.cpp +++ b/projects/rocprofiler-sdk/source/lib/common/container/ring_buffer.cpp @@ -24,6 +24,8 @@ #include "lib/common/environment.hpp" #include "lib/common/units.hpp" +#include + #include #include #include @@ -71,9 +73,8 @@ ring_buffer::operator=(ring_buffer&& rhs) noexcept void ring_buffer::init(size_t _size) { - if(m_init) - throw std::runtime_error("rocprofiler::common::container::base::ring_buffer::init(size_t) " - ":: already initialized"); + ROCP_FATAL_IF(m_init) + << "rocprofiler::common::container::base::ring_buffer::init(size_t) :: already initialized"; m_init = true; @@ -85,9 +86,10 @@ ring_buffer::init(size_t _size) if((_size % units::get_page_size()) > 0) { std::ostringstream _oss{}; - _oss << "Error! size is not a multiple of page size: " << _size << " % " - << units::get_page_size() << " = " << (_size % units::get_page_size()); - throw std::runtime_error(_oss.str()); + ROCP_FATAL << fmt::format("Error! size is not a multiple of page size: {} % {} = {}", + _size, + units::get_page_size(), + (_size % units::get_page_size())); } m_size = _size; @@ -101,7 +103,7 @@ ring_buffer::init(size_t _size) { destroy(); auto _err = errno; - throw std::runtime_error(strerror(_err)); + ROCP_FATAL << fmt::format("mmap failed with errno {} :: {}", _err, strerror(_err)); } } @@ -256,10 +258,9 @@ ring_buffer::can_clear() const bool ring_buffer::clear() { - if(!can_clear()) - throw std::runtime_error( - "ring_buffer does not permit invoking clear() member function when the read " - "pointer is non-zero because this introduces thread-safety issues"); + ROCP_CI_LOG_IF(WARNING, !can_clear()) + << "ring_buffer does not permit invoking clear() member function when the read pointer is " + "non-zero because this introduces thread-safety issues"; m_write_count.store(0, std::memory_order_release); return true; diff --git a/projects/rocprofiler-sdk/source/lib/common/elf_utils.cpp b/projects/rocprofiler-sdk/source/lib/common/elf_utils.cpp index c73b6ce19d..e4d4a8c7f0 100644 --- a/projects/rocprofiler-sdk/source/lib/common/elf_utils.cpp +++ b/projects/rocprofiler-sdk/source/lib/common/elf_utils.cpp @@ -22,6 +22,9 @@ #include "lib/common/elf_utils.hpp" +#include + +#include #include #include @@ -47,22 +50,14 @@ namespace { const ELFIO::Elf_Xword PAGE_SIZE = sysconf(_SC_PAGESIZE); -template -std::string -as_hex_string(Tp&& _v, size_t _w = 16) -{ - auto _ss = std::stringstream{}; - _ss.fill('0'); - _ss << "0x" << std::hex << std::setw(_w) << std::forward(_v); - return _ss.str(); -} +using ::rocprofiler::sdk::utility::as_hex; } // namespace SymbolEntry::SymbolEntry(unsigned int _idx, const accessor_type& _accessor) : index{_idx} { if(!_accessor.get_symbol(index, name, value, size, bind, type, section_index, other)) - throw std::runtime_error("Error in ELFIO::symbol_section_accessor::get_symbol"); + ROCP_WARNING << "ELFIO::symbol_section_accessor::get_symbol failed of symbol " << _idx; } DynamicEntry::DynamicEntry(unsigned int _idx, const accessor_type& _accessor) @@ -75,7 +70,7 @@ RelocationEntry::RelocationEntry(unsigned int _idx, const accessor_type& _access : index{_idx} { if(!_accessor.get_entry(_idx, offset, symbol, type, addend)) - throw std::runtime_error("Error in ELFIO::relocation_section_accessor::get_entry"); + ROCP_WARNING << "ELFIO::relocation_section_accessor::get_entry failed for symbol " << _idx; } ElfInfo::ElfInfo(std::string _fname) @@ -111,24 +106,25 @@ read(const std::string& _inp) ROCP_TRACE << "\nReading " << _inp; - if(!reader.load(_inp)) throw std::runtime_error("Could not load elf file " + _inp); + if(!reader.load(_inp)) + ROCP_WARNING << fmt::format("ELF parsing for '{}' did not succeed", _inp); if(reader.get_class() == ELFIO::ELFCLASS32) - ROCP_TRACE << "ELF 32-bit"; + ROCP_TRACE << " - ELF 32-bit"; else - ROCP_TRACE << "ELF 64-bit"; + ROCP_TRACE << " - ELF 64-bit"; - ROCP_TRACE << "ELF file encoding: " + ROCP_TRACE << " - ELF file encoding: " << ((reader.get_encoding() == ELFIO::ELFDATA2LSB) ? std::string_view{"Little endian"} : std::string_view{"Big endian"}); - ROCP_TRACE << "ELF version: " << reader.get_elf_version(); - ROCP_TRACE << "ELF header size: " << reader.get_header_size(); - ROCP_TRACE << "ELF OS ABI: " << reader.get_os_abi(); + ROCP_TRACE << " - ELF version: " << reader.get_elf_version(); + ROCP_TRACE << " - ELF header size: " << reader.get_header_size(); + ROCP_TRACE << " - ELF OS ABI: " << reader.get_os_abi(); // Print ELF file sections info ELFIO::Elf_Half sec_num = reader.sections.size(); - ROCP_TRACE << "Number of sections: " << sec_num; + ROCP_TRACE << " - Number of sections: " << sec_num; for(ELFIO::Elf_Half j = 0; j < sec_num; ++j) { @@ -143,85 +139,85 @@ read(const std::string& _inp) for(ELFIO::Elf_Half j = 0; j < sec_num; ++j) { Section* psec = sections.at(j); - ROCP_TRACE << " [" << j << "] \t" << std::setw(20) << psec->get_name() << "\t : \t" + ROCP_TRACE << " [" << j << "] \t" << std::setw(20) << psec->get_name() << "\t : \t" << "size / entry-size = " << std::setw(6) << psec->get_size() << " / " << std::setw(3) << psec->get_entry_size() - << " | addr: " << as_hex_string(psec->get_address()) - << " | offset: " << as_hex_string(psec->get_offset()); + << " | addr: " << as_hex(psec->get_address(), 16) + << " | offset: " << as_hex(psec->get_offset(), 16); if(psec->get_size() == 0) continue; if(psec->get_type() == ELFIO::SHT_SYMTAB) { const ELFIO::symbol_section_accessor _symbols(reader, psec); - ROCP_TRACE << " Number of symbol entries: " << _symbols.get_symbols_num(); + ROCP_TRACE << " - Number of symbol entries: " << _symbols.get_symbols_num(); for(ELFIO::Elf_Xword k = 0; k < _symbols.get_symbols_num(); ++k) symbol_entries.emplace_back(k, _symbols); } else if(psec->get_type() == ELFIO::SHT_DYNSYM) { const ELFIO::symbol_section_accessor _symbols(reader, psec); - ROCP_TRACE << " Number of dynamic symbol entries: " << _symbols.get_symbols_num(); + ROCP_TRACE << " - Number of dynamic symbol entries: " << _symbols.get_symbols_num(); for(ELFIO::Elf_Xword k = 0; k < _symbols.get_symbols_num(); ++k) dynamic_symbol_entries.emplace_back(k, _symbols); } else if(psec->get_type() == ELFIO::SHT_DYNAMIC) { const ELFIO::dynamic_section_accessor dynamic{reader, psec}; - ROCP_TRACE << " Number of dynamic entries: " << dynamic.get_entries_num(); + ROCP_TRACE << " - Number of dynamic entries: " << dynamic.get_entries_num(); for(ELFIO::Elf_Xword k = 0; k < dynamic.get_entries_num(); ++k) dynamic_entries.emplace_back(k, dynamic); } else if(psec->get_type() == ELFIO::SHT_REL || psec->get_type() == ELFIO::SHT_RELA) { const ELFIO::relocation_section_accessor reloc{reader, psec}; - ROCP_TRACE << " Number of relocation entries: " << reloc.get_entries_num(); + ROCP_TRACE << " - Number of relocation entries: " << reloc.get_entries_num(); for(ELFIO::Elf_Xword k = 0; k < reloc.get_entries_num(); ++k) reloc_entries.emplace_back(k, reloc); } } - ROCP_TRACE << "Symbols:"; + ROCP_TRACE << " - Symbols:"; for(size_t k = 0; k < symbol_entries.size(); ++k) { if(!symbol_entries.at(k).name.empty()) - ROCP_TRACE << " [" << k << "] " << symbol_entries.at(k).name; + ROCP_TRACE << " [" << k << "] " << symbol_entries.at(k).name; } - ROCP_TRACE << "Dynamic Symbols:"; + ROCP_TRACE << " - Dynamic Symbols:"; for(size_t k = 0; k < dynamic_symbol_entries.size(); ++k) { if(!dynamic_symbol_entries.at(k).name.empty()) - ROCP_TRACE << " [" << k << "] " << dynamic_symbol_entries.at(k).name; + ROCP_TRACE << " [" << k << "] " << dynamic_symbol_entries.at(k).name; } - ROCP_TRACE << "Dynamic entries:"; + ROCP_TRACE << " - Dynamic entries:"; for(size_t k = 0; k < dynamic_entries.size(); ++k) { if(!dynamic_entries.at(k).name.empty()) - ROCP_TRACE << " [" << k << "] " << dynamic_entries.at(k).name; + ROCP_TRACE << " [" << k << "] " << dynamic_entries.at(k).name; } - ROCP_TRACE << "Relocation entries:"; + ROCP_TRACE << " - Relocation entries:"; for(size_t k = 0; k < reloc_entries.size(); ++k) { auto _sym_idx = reloc_entries.at(k).symbol; auto _name = std::string{}; if(_sym_idx < symbol_entries.size()) _name = symbol_entries.at(_sym_idx).name; - if(!_name.empty()) ROCP_TRACE << " [" << k << "] " << _name; + if(!_name.empty()) ROCP_TRACE << " [" << k << "] " << _name; } // Print ELF file segments info ELFIO::Elf_Half seg_num = reader.segments.size(); - ROCP_TRACE << "Number of segments: " << seg_num; + ROCP_TRACE << " - Number of segments: " << seg_num; for(ELFIO::Elf_Half j = 0; j < seg_num; ++j) { const ELFIO::segment* pseg = reader.segments[j]; - ROCP_TRACE << " [" << std::setw(2) << j << "] flags: " << as_hex_string(pseg->get_flags()) - << " offset: " << as_hex_string(pseg->get_offset()) - << " align: " << as_hex_string(pseg->get_align()) - << " virt: " << as_hex_string(pseg->get_virtual_address()) - << " phys: " << as_hex_string(pseg->get_physical_address()) + ROCP_TRACE << " [" << std::setw(2) << j << "] flags: " << as_hex(pseg->get_flags(), 16) + << " offset: " << as_hex(pseg->get_offset(), 16) + << " align: " << as_hex(pseg->get_align(), 16) + << " virt: " << as_hex(pseg->get_virtual_address(), 16) + << " phys: " << as_hex(pseg->get_physical_address(), 16) << " fsize: " << std::setw(8) << pseg->get_file_size() << " msize: " << std::setw(8) << pseg->get_memory_size(); } diff --git a/projects/rocprofiler-sdk/source/lib/common/environment.cpp b/projects/rocprofiler-sdk/source/lib/common/environment.cpp index ab727b5666..8f7cd4d045 100644 --- a/projects/rocprofiler-sdk/source/lib/common/environment.cpp +++ b/projects/rocprofiler-sdk/source/lib/common/environment.cpp @@ -24,6 +24,8 @@ #include "lib/common/demangle.hpp" #include "lib/common/logging.hpp" +#include + #include #include #include @@ -62,8 +64,7 @@ get_env(std::string_view env_id, bool _default) { if(std::string_view{env_var}.empty()) { - throw std::runtime_error(std::string{"No boolean value provided for "} + - std::string{env_id}); + ROCP_FATAL << fmt::format("No boolean value provided for {}", env_id); } if(std::string_view{env_var}.find_first_not_of("0123456789") == std::string_view::npos) diff --git a/projects/rocprofiler-sdk/source/lib/common/memory/pool.hpp b/projects/rocprofiler-sdk/source/lib/common/memory/pool.hpp index 8890de28a6..6857d40450 100644 --- a/projects/rocprofiler-sdk/source/lib/common/memory/pool.hpp +++ b/projects/rocprofiler-sdk/source/lib/common/memory/pool.hpp @@ -23,6 +23,7 @@ #pragma once #include "lib/common/defines.hpp" +#include "lib/common/logging.hpp" #include #include @@ -68,8 +69,7 @@ public: { if(!(m_addrs.empty() && m_blocks.empty())) { - throw std::runtime_error{"cannot call pool::rebind() after alloc"}; - ::abort(); + ROCP_FATAL << "cannot call pool::rebind() after alloc"; } m_size = size; diff --git a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp index 9c8ef7d28e..7826a28515 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp @@ -124,21 +124,16 @@ write_json(json_output& json_ar, auto code_object_load_info = tool_metadata.get_code_object_load_info(); auto att_filenames = tool_metadata.get_att_filenames(); auto code_object_snapshot_filenames = std::vector{}; + code_object_snapshot_filenames.reserve(code_object_load_info.size()); for(const auto& info : code_object_load_info) - { code_object_snapshot_filenames.emplace_back(fs::path(info.name).filename()); - } + json_ar.setNextName("strings"); json_ar.startNode(); json_ar(cereal::make_nvp("callback_records", callback_name_info)); json_ar(cereal::make_nvp("buffer_records", buffer_name_info)); json_ar(cereal::make_nvp("marker_api", marker_msg_data)); - json_ar( - cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions())); - json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments())); - json_ar(cereal::make_nvp("att_filenames", att_filenames)); - json_ar(cereal::make_nvp("code_object_snapshot_filenames", code_object_snapshot_filenames)); { auto _extern_corr_id_strings = std::map{}; if(cfg.kernel_rename) @@ -166,6 +161,12 @@ write_json(json_output& json_ar, json_ar.finishNode(); } + json_ar( + cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions())); + json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments())); + json_ar(cereal::make_nvp("att_filenames", att_filenames)); + json_ar(cereal::make_nvp("code_object_snapshot_filenames", code_object_snapshot_filenames)); + json_ar.finishNode(); } diff --git a/projects/rocprofiler-sdk/source/lib/output/metadata.cpp b/projects/rocprofiler-sdk/source/lib/output/metadata.cpp index a79ab335ba..f48ae04fcd 100644 --- a/projects/rocprofiler-sdk/source/lib/output/metadata.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/metadata.cpp @@ -256,7 +256,7 @@ metadata::get_att_filenames() const { for(const auto& file : filenames.second.second) { - data.emplace_back(fs::path(file).filename()); + data.emplace_back(fs::path{file}.filename()); } } return data; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp index 6bf7418073..826d1eab47 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -1007,9 +1007,11 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, if(cur_header == nullptr) { - throw std::runtime_error{ - "rocprofiler provided a null pointer to header. this should never happen"}; + ROCP_CI_LOG(WARNING) << "rocprofiler provided a null pointer to buffer record header. " + "this should never happen"; + continue; } + else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING) { if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/agent.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/agent.cpp index a9948bedf8..94e07c69d7 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/agent.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/agent.cpp @@ -207,11 +207,17 @@ read_file(const std::string& fname) auto data = std::vector{}; if(!is_readable(fs::path{fname})) - throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)}; + { + ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname); + return data; + } auto ifs = std::ifstream{fname}; if(!ifs || !ifs.good()) - throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)}; + { + ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname); + return data; + } while(true) { @@ -231,11 +237,17 @@ read_map(const std::string& fname) auto data = std::unordered_map{}; if(!is_readable(fs::path{fname})) - throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)}; + { + ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname); + return data; + } auto ifs = std::ifstream{fname}; if(!ifs || !ifs.good()) - throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)}; + { + ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname); + return data; + } auto last_label = std::string{}; while(true) @@ -247,17 +259,23 @@ read_map(const std::string& fname) auto entry = std::string{}; ifs >> entry; if(ifs.eof()) - throw std::runtime_error{ - fmt::format("unexpected file format in '{}' at {}", fname, label)}; + { + ROCP_CI_LOG(WARNING) << fmt::format( + "unexpected file format in '{}' at {}", fname, label); + continue; + } auto ret = data.emplace(label, entry); if(!ret.second) - throw std::runtime_error{ - fmt::format("duplicate entry in '{}': '{}' (='{}'). last label was '{}'", - fname, - label, - entry, - last_label)}; + { + ROCP_CI_LOG(WARNING) << fmt::format( + "duplicate entry in '{}': '{}' (='{}'). last label was '{}'", + fname, + label, + entry, + last_label); + continue; + } if(!label.empty()) last_label = std::move(label); } @@ -305,21 +323,22 @@ read_property(const MapT& data, const std::string& label, Tp& value) constexpr auto max_value = std::numeric_limits::max(); if(local_value < min_value) { - throw std::runtime_error{ - fmt::format("data with label {} has a value (={}) which is less " - "than the min value for the type (={})", - label, - local_value, - min_value)}; + ROCP_CI_LOG(WARNING) << fmt::format( + "data with label {} has a value (={}) which is less " + "than the min value for the type (={})", + label, + local_value, + min_value); + return; } else if(local_value > max_value) { - throw std::runtime_error{fmt::format("data with label {} has a value (={}) which is " - "greater " - "than the max value for the type (={})", - label, - local_value, - max_value)}; + ROCP_CI_LOG(WARNING) << fmt::format("data with label {} has a value (={}) which is " + "greater than the max value for the type (={})", + label, + local_value, + max_value); + return; } if constexpr(std::is_const::value) @@ -544,13 +563,17 @@ using unique_agent_t = std::unique_ptr{}; + + const auto sysfs_nodes_path = fs::path{"/sys/class/kfd/kfd/topology/nodes"}; if(!fs::exists(sysfs_nodes_path)) - throw std::runtime_error{ - fmt::format("sysfs nodes path '{}' does not exist", sysfs_nodes_path.string())}; + { + ROCP_CI_LOG(WARNING) << fmt::format("sysfs nodes path '{}' does not exist", + sysfs_nodes_path.string()); + return data; + } const auto& cpu_info_v = get_cpu_info(); - auto data = std::vector{}; uint64_t idcount = 0; uint64_t nodecount = 0; uint64_t cpucount = 0; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.hpp index 7524f08d5a..f0c249c84d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.hpp @@ -124,11 +124,12 @@ rocprofiler::buffer::instance::emplace(uint32_t category, uint32_t kind, Tp& val { if(buffers.at(idx).capacity() < sizeof(value)) { - auto msg = std::stringstream{}; - msg << "buffer " << buffer_id << " to small (size=" << buffers.at(idx).capacity() - << ") to hold an object of type " << common::cxx_demangle(typeid(value).name()) - << " with size " << sizeof(value); - throw std::runtime_error(msg.str()); + ROCP_CI_LOG(ERROR) << "buffer " << buffer_id + << " too small (size=" << buffers.at(idx).capacity() + << ") to hold an object of type " + << common::cxx_demangle(typeid(value).name()) << " with size " + << sizeof(value); + return false; } if(policy == ROCPROFILER_BUFFER_POLICY_LOSSLESS) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/agent_cache.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/agent_cache.cpp index 63d8c301dc..c705b2201a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/agent_cache.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/agent_cache.cpp @@ -91,18 +91,14 @@ init_cpu_pool(const AmdExtTable& api, rocprofiler::hsa::AgentCache& agent) auto status = api.hsa_amd_agent_iterate_memory_pools_fn(agent.near_cpu(), FindStandardPool, ¶ms); - if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) - { - throw std::runtime_error("Error: Command Buffer Pool is not initialized"); - } + ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) + << "HSA Command Buffer Pool is not initialized"; params.second = &agent.kernarg_pool(); status = api.hsa_amd_agent_iterate_memory_pools_fn(agent.near_cpu(), FindKernArgPool, &(params)); - if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) - { - throw std::runtime_error("Error: Output Buffer Pool is not initialized"); - } + ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) + << "HSA Output Buffer Pool is not initialized"; } void @@ -113,10 +109,8 @@ init_gpu_pool(const AmdExtTable& api, rocprofiler::hsa::AgentCache& agent) auto status = api.hsa_amd_agent_iterate_memory_pools_fn(agent.get_hsa_agent(), FindStandardPool, ¶ms); - if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) - { - throw std::runtime_error("Error: GPU Pool is not initialized"); - } + ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) + << "HSA GPU Pool is not initialized"; } } // namespace @@ -153,10 +147,8 @@ AgentCache::init_device_counting_service_queue(const CoreApiTable& api, UINT32_MAX, UINT32_MAX, &m_profile_queue); - if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) - { - throw std::runtime_error("Error: Queue is not initialized"); - } + ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) + << "HSA Queue is not initialized"; CHECK(ext.hsa_amd_queue_set_priority_fn) << "no hsa_amd_queue_set_priority_fn in api table"; ext.hsa_amd_queue_set_priority_fn(m_profile_queue, HSA_AMD_QUEUE_PRIORITY_HIGH); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.cpp index a78120f0f5..23b3b28cfa 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.cpp @@ -424,8 +424,8 @@ parse_event(std::string_view str) template <> page_migration_record_t parse_event(std::string_view) { - throw std::runtime_error( - "ROCPROFILER_PAGE_MIGRATION_NONE for parsing page migration events should not happen"); + ROCP_CI_LOG(WARNING) + << "ROCPROFILER_PAGE_MIGRATION_NONE for parsing page migration events should not happen"; } template @@ -648,10 +648,11 @@ struct poll_kfd_t [&]() { const auto retcode = pipe2(&thread_pipes[0], DEFAULT_FLAGS); - - if(retcode != 0) - throw std::runtime_error{ - fmt::format("Pipe creation for thread notify failed with {} code\n", retcode)}; + const auto _err = errno; + ROCP_FATAL_IF(retcode != 0) + << fmt::format("Pipe creation for page-migration thread notify returned {} :: {}\n", + retcode, + strerror(_err)); }(); thread_notify = pollfd{ @@ -792,7 +793,11 @@ poll_events(small_vector file_handles) auto poll_ret = poll(file_handles.data(), file_handles.size(), -1); if(poll_ret == -1) - throw std::runtime_error{"Background thread file descriptors are invalid"}; + { + ROCP_CI_LOG(WARNING) + << "Background thread file descriptors for page-migration are invalid"; + return; + } if((exitfd.revents & POLLIN) != 0) { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/hsa_adapter.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/hsa_adapter.cpp index b3db988d4a..93b2711569 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/hsa_adapter.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/hsa_adapter.cpp @@ -78,14 +78,16 @@ amd_intercept_marker_handler_callback(const struct amd_aql_intercept_marker_s* p if(ext_table_->hsa_amd_queue_get_info_fn(queue, HSA_AMD_QUEUE_INFO_AGENT, &hsa_agent) != HSA_STATUS_SUCCESS) { - throw std::runtime_error("Cannot map hsa_queue_t* to hsa_agent_t"); + ROCP_CI_LOG(WARNING) << "Cannot map hsa_queue_t* to hsa_agent_t"; + return; } uint64_t doorbell_id = 0; if(ext_table_->hsa_amd_queue_get_info_fn(queue, HSA_AMD_QUEUE_INFO_DOORBELL_ID, &doorbell_id) != HSA_STATUS_SUCCESS) { - throw std::runtime_error("Cannot map hsa_queue_t* to doorbell_id"); + ROCP_CI_LOG(WARNING) << "Cannot map hsa_queue_t* to doorbell id"; + return; } auto internal_correlation = packet->user_data[0]; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp index 3bc27ae97f..cb912b9ff8 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp @@ -21,12 +21,12 @@ // SOFTWARE. #include "lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.hpp" - -#include "lib/rocprofiler-sdk/details/kfd_ioctl.h" - #include "lib/common/logging.hpp" +#include "lib/rocprofiler-sdk/details/kfd_ioctl.h" #include "lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter_types.hpp" +#include + #include #include @@ -61,14 +61,15 @@ struct pc_sampling_ioctl_version_t int kfd_open() { - int fd = -1; - static const char kfd_device_name[] = "/dev/kfd"; + int fd = -1; + constexpr auto* kfd_device_name = "/dev/kfd"; fd = open(kfd_device_name, O_RDWR | O_CLOEXEC); if(fd == -1) { - throw std::runtime_error("Cannot open /dev/kfd"); + ROCP_CI_LOG(WARNING) << fmt::format("Cannot open {} for pc sampling", kfd_device_name); + return -1; } return fd; @@ -485,6 +486,8 @@ ioctl_pcs_create(const rocprofiler_agent_t* agent, args.num_sample_info = 1; args.trace_id = INVALID_TRACE_ID; + if(get_kfd_fd() == -1) return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE; + auto ioctl_ret = ioctl(get_kfd_fd(), AMDKFD_IOC_PC_SAMPLE, &args); *ioctl_pcs_id = args.trace_id; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/utils.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/utils.cpp index a8f59d464a..407729d7eb 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/utils.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/utils.cpp @@ -21,6 +21,7 @@ // SOFTWARE. #include "lib/rocprofiler-sdk/pc_sampling/utils.hpp" +#include "lib/common/logging.hpp" #include "lib/rocprofiler-sdk/pc_sampling/defines.hpp" #if ROCPROFILER_SDK_HSA_PC_SAMPLING > 0 @@ -49,7 +50,7 @@ get_matching_hsa_pcs_method(rocprofiler_pc_sampling_method_t method) case ROCPROFILER_PC_SAMPLING_METHOD_LAST: break; } - throw std::runtime_error("Illegal pc sampling method\n"); + ROCP_FATAL << "Illegal pc sampling method " << method; } hsa_ven_amd_pcs_units_t @@ -66,7 +67,7 @@ get_matching_hsa_pcs_units(rocprofiler_pc_sampling_unit_t unit) case ROCPROFILER_PC_SAMPLING_UNIT_LAST: break; } - throw std::runtime_error("Illegal pc sampling units\n"); + ROCP_FATAL << "Illegal pc sampling unit " << unit; } } // namespace utils } // namespace pc_sampling diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp index ac84a9d80e..14c6532d02 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp @@ -77,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -101,6 +102,49 @@ namespace { namespace fs = ::rocprofiler::common::filesystem; +bool +resolved_exists(std::string_view fname) +{ + if(fs::is_symlink(fname)) + { + // NOTE: Use of ROCP_CI_LOG(WARNING) causes segfault. Likely bc glog is not fully + // initialized + auto _errc = std::error_code{}; + auto _symlinked = fs::read_symlink(fname, _errc); + if(_errc && _symlinked.empty()) + { + ROCP_WARNING << fmt::format("Symbolic link '{}' returned error code {} :: {}", + fname, + _errc.value(), + _errc.message()); + return false; + } + else if(_errc && !_symlinked.empty()) + { + ROCP_WARNING << fmt::format("Symbolic link '{}' -> '{}' returned error code {} :: {}", + fname, + _symlinked.string(), + _errc.value(), + _errc.message()); + return false; + } + + if(_symlinked.is_relative()) _symlinked = fs::path{fname}.parent_path() / _symlinked; + + ROCP_TRACE << fmt::format("Symbolic link:\n\t{}\n\t\t-> {}", fname, _symlinked.string()); + + if(!fs::exists(_symlinked)) + { + ROCP_WARNING << fmt::format("{} is broken symbolic link", fname); + return false; + } + + return resolved_exists(fs::absolute(_symlinked).string()); + } + + return fs::exists(fname); +} + // invoke all rocprofiler_configure symbols bool invoke_client_configures(); @@ -257,14 +301,17 @@ find_clients() { ROCP_INFO << "[ROCP_TOOL_LIBRARIES] searching " << itr << " for rocprofiler_configure"; - if(fs::exists(itr)) + if(fs::exists(itr) && resolved_exists(itr)) { auto elfinfo = common::elf_utils::read(itr); if(!elfinfo.has_symbol(std::regex{"^rocprofiler_configure$"})) { - ROCP_FATAL << "[ROCP_TOOL_LIBRARIES] rocprofiler-sdk tool library '" << itr - << "' did not contain rocprofiler_configure symbol (search method: " - "ELF parsing)"; + ROCP_CI_LOG(WARNING) << fmt::format( + "[ROCP_TOOL_LIBRARIES] rocprofiler-sdk tool library '{}' did not " + "contain rocprofiler_configure symbol (search method: ELF parsing). " + "Attempting dlopen anyway since the library was explicitly listed in " + "ROCP_TOOL_LIBRARIES", + itr); } } @@ -295,10 +342,10 @@ find_clients() { auto _sym = rocprofiler_configure_dlsym(handle); // FATAL bc they explicitly said this was a tool library - ROCP_FATAL_IF(!_sym) + ROCP_CI_LOG_IF(WARNING, !_sym) << "[ROCP_TOOL_LIBRARIES] rocprofiler-sdk tool library '" << itr << "' did not contain rocprofiler_configure symbol (search method: dlsym)"; - if(is_unique_configure_func(_sym)) emplace_client(itr, handle, _sym); + if(_sym && is_unique_configure_func(_sym)) emplace_client(itr, handle, _sym); } } } @@ -323,13 +370,22 @@ find_clients() { ROCP_INFO << "searching " << itr << " for rocprofiler_configure"; - if(fs::exists(itr)) + if(fs::exists(itr) && resolved_exists(itr)) { auto elfinfo = common::elf_utils::read(itr); - if(!elfinfo.has_symbol(std::regex{"^rocprofiler_configure$"})) continue; + if(!elfinfo.has_symbol(std::regex{"^rocprofiler_configure$"})) + { + ROCP_INFO << fmt::format( + "Shared library '{}' did not contain the 'rocprofiler_configure' symbol " + "(search method: ELF parsing) required by rocprofiler-sdk for tools", + itr); + continue; + } } else { + ROCP_INFO << fmt::format( + "Shared library '{}' either does not exist or is a broken symbolic link", itr); continue; }