SDK: remove majority of exceptions (#176)
* SDK: remove majority of exceptions
- replace with ROCP_FATAL, ROCP_CI_LOG(WARNING), etc.
- improve logging of symbolic link
- add --readlink and --realpath (hidden options) to rocprofv3 to follow symlinks for preloaded libraries
* Add rocprofv3 --rocm-root argument
* Fix registration resolved_exists
* Fix rocprofv3_avail.py
* Update logging for rocprofiler_configure search
- relax failure conditions
* Misc clang-tidy fixes
* Fix merge
* Fix merge
---------
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
Co-authored-by: Bhardwaj, Gopesh <Gopesh.Bhardwaj@amd.com>
[ROCm/rocprofiler-sdk commit: 470f347e50]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
95ac740f25
Коммит
e503b1f4cc
@@ -546,6 +546,23 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
nargs="*",
|
||||
)
|
||||
|
||||
advanced_options.add_argument(
|
||||
"--rocm-root",
|
||||
help="Use the given path as the root ROCm path instead of the relative path of this script",
|
||||
type=str,
|
||||
metavar="PATH",
|
||||
default=None,
|
||||
)
|
||||
add_parser_bool_argument(
|
||||
advanced_options,
|
||||
"--readlink",
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
add_parser_bool_argument(
|
||||
advanced_options,
|
||||
"--realpath",
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
# below is available for CI because LD_PRELOADing a library linked to a sanitizer library
|
||||
# causes issues in apps where HIP is part of shared library.
|
||||
add_parser_bool_argument(
|
||||
@@ -874,6 +891,8 @@ def run(app_args, args, **kwargs):
|
||||
|
||||
ROCPROFV3_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
ROCM_DIR = os.path.dirname(ROCPROFV3_DIR)
|
||||
if args.rocm_root is not None:
|
||||
ROCM_DIR = os.path.abspath(args.rocm_root)
|
||||
ROCPROF_TOOL_LIBRARY = f"{ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so"
|
||||
ROCPROF_SDK_LIBRARY = f"{ROCM_DIR}/lib/librocprofiler-sdk.so"
|
||||
ROCPROF_ROCTX_LIBRARY = f"{ROCM_DIR}/lib/librocprofiler-sdk-roctx.so"
|
||||
@@ -884,6 +903,22 @@ def run(app_args, args, **kwargs):
|
||||
f"{ROCM_DIR}/libexec/rocprofiler-sdk/librocprofv3-list-avail.so"
|
||||
)
|
||||
|
||||
def resolve_path(val):
    """Return the library path to use, optionally following symbolic links.

    Calls fatal_error() when `val` does not exist. When `val` is a symbolic link:
      - with --readlink, the link is followed by one level (os.readlink);
      - with --realpath, the path is canonicalized (os.path.realpath).
    In every other case `val` is returned unchanged.
    """
    if not os.path.exists(val):
        fatal_error(f"{val} does not exist")
    if os.path.islink(val):
        if args.readlink:
            # os.readlink() may return a target that is relative to the directory
            # holding the link, so anchor it there. os.path.abspath() alone would
            # resolve it against the current working directory. An absolute
            # target is returned untouched by os.path.join().
            val = os.path.abspath(
                os.path.join(os.path.dirname(val), os.readlink(val))
            )
        if args.realpath:
            val = os.path.realpath(val)
    return val
|
||||
|
||||
ROCPROF_TOOL_LIBRARY = resolve_path(ROCPROF_TOOL_LIBRARY)
|
||||
ROCPROF_SDK_LIBRARY = resolve_path(ROCPROF_SDK_LIBRARY)
|
||||
ROCPROF_ROCTX_LIBRARY = resolve_path(ROCPROF_ROCTX_LIBRARY)
|
||||
ROCPROF_KOKKOSP_LIBRARY = resolve_path(ROCPROF_KOKKOSP_LIBRARY)
|
||||
ROCPROF_LIST_AVAIL_TOOL_LIBRARY = resolve_path(ROCPROF_LIST_AVAIL_TOOL_LIBRARY)
|
||||
|
||||
prepend_preload = [itr for itr in args.preload if itr]
|
||||
append_preload = [
|
||||
ROCPROF_TOOL_LIBRARY,
|
||||
|
||||
Обычный файл → Исполняемый файл
+12
-1
@@ -69,10 +69,21 @@ class pc_config:
|
||||
self.max_interval = max_interval
|
||||
|
||||
|
||||
ROCPROFV3_AVAIL_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
ROCM_DIR = os.path.dirname(ROCPROFV3_AVAIL_DIR)
|
||||
ROCPROF_LIST_AVAIL_TOOL_LIBRARY = (
|
||||
f"{ROCM_DIR}/libexec/rocprofiler-sdk/librocprofv3-list-avail.so"
|
||||
)
|
||||
|
||||
MAX_STR = 256
|
||||
libname = os.environ.get("ROCPROF_LIST_AVAIL_TOOL_LIBRARY")
|
||||
libname = os.environ.get(
|
||||
"ROCPROF_LIST_AVAIL_TOOL_LIBRARY", ROCPROF_LIST_AVAIL_TOOL_LIBRARY
|
||||
)
|
||||
c_lib = ctypes.CDLL(libname)
|
||||
|
||||
if c_lib is None:
|
||||
fatal_error(f"Error opening {libname}")
|
||||
|
||||
c_lib.get_number_of_counters.restype = ctypes.c_ulong
|
||||
c_lib.get_number_of_pc_sample_configs.restype = ctypes.c_ulong
|
||||
c_lib.get_number_of_dimensions.restype = ctypes.c_ulong
|
||||
|
||||
@@ -24,6 +24,8 @@
|
||||
#include "lib/common/environment.hpp"
|
||||
#include "lib/common/units.hpp"
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <sys/mman.h>
|
||||
#include <atomic>
|
||||
#include <cerrno>
|
||||
@@ -71,9 +73,8 @@ ring_buffer::operator=(ring_buffer&& rhs) noexcept
|
||||
void
|
||||
ring_buffer::init(size_t _size)
|
||||
{
|
||||
if(m_init)
|
||||
throw std::runtime_error("rocprofiler::common::container::base::ring_buffer::init(size_t) "
|
||||
":: already initialized");
|
||||
ROCP_FATAL_IF(m_init)
|
||||
<< "rocprofiler::common::container::base::ring_buffer::init(size_t) :: already initialized";
|
||||
|
||||
m_init = true;
|
||||
|
||||
@@ -85,9 +86,10 @@ ring_buffer::init(size_t _size)
|
||||
if((_size % units::get_page_size()) > 0)
|
||||
{
|
||||
std::ostringstream _oss{};
|
||||
_oss << "Error! size is not a multiple of page size: " << _size << " % "
|
||||
<< units::get_page_size() << " = " << (_size % units::get_page_size());
|
||||
throw std::runtime_error(_oss.str());
|
||||
ROCP_FATAL << fmt::format("Error! size is not a multiple of page size: {} % {} = {}",
|
||||
_size,
|
||||
units::get_page_size(),
|
||||
(_size % units::get_page_size()));
|
||||
}
|
||||
|
||||
m_size = _size;
|
||||
@@ -101,7 +103,7 @@ ring_buffer::init(size_t _size)
|
||||
{
|
||||
destroy();
|
||||
auto _err = errno;
|
||||
throw std::runtime_error(strerror(_err));
|
||||
ROCP_FATAL << fmt::format("mmap failed with errno {} :: {}", _err, strerror(_err));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -256,10 +258,9 @@ ring_buffer::can_clear() const
|
||||
bool
|
||||
ring_buffer::clear()
|
||||
{
|
||||
if(!can_clear())
|
||||
throw std::runtime_error(
|
||||
"ring_buffer does not permit invoking clear() member function when the read "
|
||||
"pointer is non-zero because this introduces thread-safety issues");
|
||||
ROCP_CI_LOG_IF(WARNING, !can_clear())
|
||||
<< "ring_buffer does not permit invoking clear() member function when the read pointer is "
|
||||
"non-zero because this introduces thread-safety issues";
|
||||
|
||||
m_write_count.store(0, std::memory_order_release);
|
||||
return true;
|
||||
|
||||
@@ -22,6 +22,9 @@
|
||||
|
||||
#include "lib/common/elf_utils.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/cxx/utility.hpp>
|
||||
|
||||
#include <fmt/format.h>
|
||||
#include <elfio/elfio.hpp>
|
||||
|
||||
#include <sys/stat.h>
|
||||
@@ -47,22 +50,14 @@ namespace
|
||||
{
|
||||
const ELFIO::Elf_Xword PAGE_SIZE = sysconf(_SC_PAGESIZE);
|
||||
|
||||
template <typename Tp>
|
||||
std::string
|
||||
as_hex_string(Tp&& _v, size_t _w = 16)
|
||||
{
|
||||
auto _ss = std::stringstream{};
|
||||
_ss.fill('0');
|
||||
_ss << "0x" << std::hex << std::setw(_w) << std::forward<Tp>(_v);
|
||||
return _ss.str();
|
||||
}
|
||||
using ::rocprofiler::sdk::utility::as_hex;
|
||||
} // namespace
|
||||
|
||||
SymbolEntry::SymbolEntry(unsigned int _idx, const accessor_type& _accessor)
|
||||
: index{_idx}
|
||||
{
|
||||
if(!_accessor.get_symbol(index, name, value, size, bind, type, section_index, other))
|
||||
throw std::runtime_error("Error in ELFIO::symbol_section_accessor::get_symbol");
|
||||
ROCP_WARNING << "ELFIO::symbol_section_accessor::get_symbol failed of symbol " << _idx;
|
||||
}
|
||||
|
||||
DynamicEntry::DynamicEntry(unsigned int _idx, const accessor_type& _accessor)
|
||||
@@ -75,7 +70,7 @@ RelocationEntry::RelocationEntry(unsigned int _idx, const accessor_type& _access
|
||||
: index{_idx}
|
||||
{
|
||||
if(!_accessor.get_entry(_idx, offset, symbol, type, addend))
|
||||
throw std::runtime_error("Error in ELFIO::relocation_section_accessor::get_entry");
|
||||
ROCP_WARNING << "ELFIO::relocation_section_accessor::get_entry failed for symbol " << _idx;
|
||||
}
|
||||
|
||||
ElfInfo::ElfInfo(std::string _fname)
|
||||
@@ -111,24 +106,25 @@ read(const std::string& _inp)
|
||||
|
||||
ROCP_TRACE << "\nReading " << _inp;
|
||||
|
||||
if(!reader.load(_inp)) throw std::runtime_error("Could not load elf file " + _inp);
|
||||
if(!reader.load(_inp))
|
||||
ROCP_WARNING << fmt::format("ELF parsing for '{}' did not succeed", _inp);
|
||||
|
||||
if(reader.get_class() == ELFIO::ELFCLASS32)
|
||||
ROCP_TRACE << "ELF 32-bit";
|
||||
ROCP_TRACE << " - ELF 32-bit";
|
||||
else
|
||||
ROCP_TRACE << "ELF 64-bit";
|
||||
ROCP_TRACE << " - ELF 64-bit";
|
||||
|
||||
ROCP_TRACE << "ELF file encoding: "
|
||||
ROCP_TRACE << " - ELF file encoding: "
|
||||
<< ((reader.get_encoding() == ELFIO::ELFDATA2LSB) ? std::string_view{"Little endian"}
|
||||
: std::string_view{"Big endian"});
|
||||
|
||||
ROCP_TRACE << "ELF version: " << reader.get_elf_version();
|
||||
ROCP_TRACE << "ELF header size: " << reader.get_header_size();
|
||||
ROCP_TRACE << "ELF OS ABI: " << reader.get_os_abi();
|
||||
ROCP_TRACE << " - ELF version: " << reader.get_elf_version();
|
||||
ROCP_TRACE << " - ELF header size: " << reader.get_header_size();
|
||||
ROCP_TRACE << " - ELF OS ABI: " << reader.get_os_abi();
|
||||
|
||||
// Print ELF file sections info
|
||||
ELFIO::Elf_Half sec_num = reader.sections.size();
|
||||
ROCP_TRACE << "Number of sections: " << sec_num;
|
||||
ROCP_TRACE << " - Number of sections: " << sec_num;
|
||||
|
||||
for(ELFIO::Elf_Half j = 0; j < sec_num; ++j)
|
||||
{
|
||||
@@ -143,85 +139,85 @@ read(const std::string& _inp)
|
||||
for(ELFIO::Elf_Half j = 0; j < sec_num; ++j)
|
||||
{
|
||||
Section* psec = sections.at(j);
|
||||
ROCP_TRACE << " [" << j << "] \t" << std::setw(20) << psec->get_name() << "\t : \t"
|
||||
ROCP_TRACE << " [" << j << "] \t" << std::setw(20) << psec->get_name() << "\t : \t"
|
||||
<< "size / entry-size = " << std::setw(6) << psec->get_size() << " / "
|
||||
<< std::setw(3) << psec->get_entry_size()
|
||||
<< " | addr: " << as_hex_string(psec->get_address())
|
||||
<< " | offset: " << as_hex_string(psec->get_offset());
|
||||
<< " | addr: " << as_hex(psec->get_address(), 16)
|
||||
<< " | offset: " << as_hex(psec->get_offset(), 16);
|
||||
|
||||
if(psec->get_size() == 0) continue;
|
||||
|
||||
if(psec->get_type() == ELFIO::SHT_SYMTAB)
|
||||
{
|
||||
const ELFIO::symbol_section_accessor _symbols(reader, psec);
|
||||
ROCP_TRACE << " Number of symbol entries: " << _symbols.get_symbols_num();
|
||||
ROCP_TRACE << " - Number of symbol entries: " << _symbols.get_symbols_num();
|
||||
for(ELFIO::Elf_Xword k = 0; k < _symbols.get_symbols_num(); ++k)
|
||||
symbol_entries.emplace_back(k, _symbols);
|
||||
}
|
||||
else if(psec->get_type() == ELFIO::SHT_DYNSYM)
|
||||
{
|
||||
const ELFIO::symbol_section_accessor _symbols(reader, psec);
|
||||
ROCP_TRACE << " Number of dynamic symbol entries: " << _symbols.get_symbols_num();
|
||||
ROCP_TRACE << " - Number of dynamic symbol entries: " << _symbols.get_symbols_num();
|
||||
for(ELFIO::Elf_Xword k = 0; k < _symbols.get_symbols_num(); ++k)
|
||||
dynamic_symbol_entries.emplace_back(k, _symbols);
|
||||
}
|
||||
else if(psec->get_type() == ELFIO::SHT_DYNAMIC)
|
||||
{
|
||||
const ELFIO::dynamic_section_accessor dynamic{reader, psec};
|
||||
ROCP_TRACE << " Number of dynamic entries: " << dynamic.get_entries_num();
|
||||
ROCP_TRACE << " - Number of dynamic entries: " << dynamic.get_entries_num();
|
||||
for(ELFIO::Elf_Xword k = 0; k < dynamic.get_entries_num(); ++k)
|
||||
dynamic_entries.emplace_back(k, dynamic);
|
||||
}
|
||||
else if(psec->get_type() == ELFIO::SHT_REL || psec->get_type() == ELFIO::SHT_RELA)
|
||||
{
|
||||
const ELFIO::relocation_section_accessor reloc{reader, psec};
|
||||
ROCP_TRACE << " Number of relocation entries: " << reloc.get_entries_num();
|
||||
ROCP_TRACE << " - Number of relocation entries: " << reloc.get_entries_num();
|
||||
for(ELFIO::Elf_Xword k = 0; k < reloc.get_entries_num(); ++k)
|
||||
reloc_entries.emplace_back(k, reloc);
|
||||
}
|
||||
}
|
||||
|
||||
ROCP_TRACE << "Symbols:";
|
||||
ROCP_TRACE << " - Symbols:";
|
||||
for(size_t k = 0; k < symbol_entries.size(); ++k)
|
||||
{
|
||||
if(!symbol_entries.at(k).name.empty())
|
||||
ROCP_TRACE << " [" << k << "] " << symbol_entries.at(k).name;
|
||||
ROCP_TRACE << " [" << k << "] " << symbol_entries.at(k).name;
|
||||
}
|
||||
|
||||
ROCP_TRACE << "Dynamic Symbols:";
|
||||
ROCP_TRACE << " - Dynamic Symbols:";
|
||||
for(size_t k = 0; k < dynamic_symbol_entries.size(); ++k)
|
||||
{
|
||||
if(!dynamic_symbol_entries.at(k).name.empty())
|
||||
ROCP_TRACE << " [" << k << "] " << dynamic_symbol_entries.at(k).name;
|
||||
ROCP_TRACE << " [" << k << "] " << dynamic_symbol_entries.at(k).name;
|
||||
}
|
||||
|
||||
ROCP_TRACE << "Dynamic entries:";
|
||||
ROCP_TRACE << " - Dynamic entries:";
|
||||
for(size_t k = 0; k < dynamic_entries.size(); ++k)
|
||||
{
|
||||
if(!dynamic_entries.at(k).name.empty())
|
||||
ROCP_TRACE << " [" << k << "] " << dynamic_entries.at(k).name;
|
||||
ROCP_TRACE << " [" << k << "] " << dynamic_entries.at(k).name;
|
||||
}
|
||||
|
||||
ROCP_TRACE << "Relocation entries:";
|
||||
ROCP_TRACE << " - Relocation entries:";
|
||||
for(size_t k = 0; k < reloc_entries.size(); ++k)
|
||||
{
|
||||
auto _sym_idx = reloc_entries.at(k).symbol;
|
||||
auto _name = std::string{};
|
||||
if(_sym_idx < symbol_entries.size()) _name = symbol_entries.at(_sym_idx).name;
|
||||
if(!_name.empty()) ROCP_TRACE << " [" << k << "] " << _name;
|
||||
if(!_name.empty()) ROCP_TRACE << " [" << k << "] " << _name;
|
||||
}
|
||||
|
||||
// Print ELF file segments info
|
||||
ELFIO::Elf_Half seg_num = reader.segments.size();
|
||||
ROCP_TRACE << "Number of segments: " << seg_num;
|
||||
ROCP_TRACE << " - Number of segments: " << seg_num;
|
||||
for(ELFIO::Elf_Half j = 0; j < seg_num; ++j)
|
||||
{
|
||||
const ELFIO::segment* pseg = reader.segments[j];
|
||||
ROCP_TRACE << " [" << std::setw(2) << j << "] flags: " << as_hex_string(pseg->get_flags())
|
||||
<< " offset: " << as_hex_string(pseg->get_offset())
|
||||
<< " align: " << as_hex_string(pseg->get_align())
|
||||
<< " virt: " << as_hex_string(pseg->get_virtual_address())
|
||||
<< " phys: " << as_hex_string(pseg->get_physical_address())
|
||||
ROCP_TRACE << " [" << std::setw(2) << j << "] flags: " << as_hex(pseg->get_flags(), 16)
|
||||
<< " offset: " << as_hex(pseg->get_offset(), 16)
|
||||
<< " align: " << as_hex(pseg->get_align(), 16)
|
||||
<< " virt: " << as_hex(pseg->get_virtual_address(), 16)
|
||||
<< " phys: " << as_hex(pseg->get_physical_address(), 16)
|
||||
<< " fsize: " << std::setw(8) << pseg->get_file_size()
|
||||
<< " msize: " << std::setw(8) << pseg->get_memory_size();
|
||||
}
|
||||
|
||||
@@ -24,6 +24,8 @@
|
||||
#include "lib/common/demangle.hpp"
|
||||
#include "lib/common/logging.hpp"
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <cctype>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
@@ -62,8 +64,7 @@ get_env(std::string_view env_id, bool _default)
|
||||
{
|
||||
if(std::string_view{env_var}.empty())
|
||||
{
|
||||
throw std::runtime_error(std::string{"No boolean value provided for "} +
|
||||
std::string{env_id});
|
||||
ROCP_FATAL << fmt::format("No boolean value provided for {}", env_id);
|
||||
}
|
||||
|
||||
if(std::string_view{env_var}.find_first_not_of("0123456789") == std::string_view::npos)
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "lib/common/defines.hpp"
|
||||
#include "lib/common/logging.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
@@ -68,8 +69,7 @@ public:
|
||||
{
|
||||
if(!(m_addrs.empty() && m_blocks.empty()))
|
||||
{
|
||||
throw std::runtime_error{"cannot call pool::rebind() after alloc"};
|
||||
::abort();
|
||||
ROCP_FATAL << "cannot call pool::rebind() after alloc";
|
||||
}
|
||||
|
||||
m_size = size;
|
||||
|
||||
@@ -124,21 +124,16 @@ write_json(json_output& json_ar,
|
||||
auto code_object_load_info = tool_metadata.get_code_object_load_info();
|
||||
auto att_filenames = tool_metadata.get_att_filenames();
|
||||
auto code_object_snapshot_filenames = std::vector<std::string>{};
|
||||
|
||||
code_object_snapshot_filenames.reserve(code_object_load_info.size());
|
||||
for(const auto& info : code_object_load_info)
|
||||
{
|
||||
code_object_snapshot_filenames.emplace_back(fs::path(info.name).filename());
|
||||
}
|
||||
|
||||
json_ar.setNextName("strings");
|
||||
json_ar.startNode();
|
||||
json_ar(cereal::make_nvp("callback_records", callback_name_info));
|
||||
json_ar(cereal::make_nvp("buffer_records", buffer_name_info));
|
||||
json_ar(cereal::make_nvp("marker_api", marker_msg_data));
|
||||
json_ar(
|
||||
cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions()));
|
||||
json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments()));
|
||||
json_ar(cereal::make_nvp("att_filenames", att_filenames));
|
||||
json_ar(cereal::make_nvp("code_object_snapshot_filenames", code_object_snapshot_filenames));
|
||||
{
|
||||
auto _extern_corr_id_strings = std::map<size_t, std::string>{};
|
||||
if(cfg.kernel_rename)
|
||||
@@ -166,6 +161,12 @@ write_json(json_output& json_ar,
|
||||
json_ar.finishNode();
|
||||
}
|
||||
|
||||
json_ar(
|
||||
cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions()));
|
||||
json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments()));
|
||||
json_ar(cereal::make_nvp("att_filenames", att_filenames));
|
||||
json_ar(cereal::make_nvp("code_object_snapshot_filenames", code_object_snapshot_filenames));
|
||||
|
||||
json_ar.finishNode();
|
||||
}
|
||||
|
||||
|
||||
@@ -256,7 +256,7 @@ metadata::get_att_filenames() const
|
||||
{
|
||||
for(const auto& file : filenames.second.second)
|
||||
{
|
||||
data.emplace_back(fs::path(file).filename());
|
||||
data.emplace_back(fs::path{file}.filename());
|
||||
}
|
||||
}
|
||||
return data;
|
||||
|
||||
@@ -1007,9 +1007,11 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/,
|
||||
|
||||
if(cur_header == nullptr)
|
||||
{
|
||||
throw std::runtime_error{
|
||||
"rocprofiler provided a null pointer to header. this should never happen"};
|
||||
ROCP_CI_LOG(WARNING) << "rocprofiler provided a null pointer to buffer record header. "
|
||||
"this should never happen";
|
||||
continue;
|
||||
}
|
||||
|
||||
else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING)
|
||||
{
|
||||
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE)
|
||||
|
||||
@@ -207,11 +207,17 @@ read_file(const std::string& fname)
|
||||
auto data = std::vector<std::string>{};
|
||||
|
||||
if(!is_readable(fs::path{fname}))
|
||||
throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)};
|
||||
{
|
||||
ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname);
|
||||
return data;
|
||||
}
|
||||
|
||||
auto ifs = std::ifstream{fname};
|
||||
if(!ifs || !ifs.good())
|
||||
throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)};
|
||||
{
|
||||
ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname);
|
||||
return data;
|
||||
}
|
||||
|
||||
while(true)
|
||||
{
|
||||
@@ -231,11 +237,17 @@ read_map(const std::string& fname)
|
||||
auto data = std::unordered_map<std::string, std::string>{};
|
||||
|
||||
if(!is_readable(fs::path{fname}))
|
||||
throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)};
|
||||
{
|
||||
ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname);
|
||||
return data;
|
||||
}
|
||||
|
||||
auto ifs = std::ifstream{fname};
|
||||
if(!ifs || !ifs.good())
|
||||
throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)};
|
||||
{
|
||||
ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname);
|
||||
return data;
|
||||
}
|
||||
|
||||
auto last_label = std::string{};
|
||||
while(true)
|
||||
@@ -247,17 +259,23 @@ read_map(const std::string& fname)
|
||||
auto entry = std::string{};
|
||||
ifs >> entry;
|
||||
if(ifs.eof())
|
||||
throw std::runtime_error{
|
||||
fmt::format("unexpected file format in '{}' at {}", fname, label)};
|
||||
{
|
||||
ROCP_CI_LOG(WARNING) << fmt::format(
|
||||
"unexpected file format in '{}' at {}", fname, label);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto ret = data.emplace(label, entry);
|
||||
if(!ret.second)
|
||||
throw std::runtime_error{
|
||||
fmt::format("duplicate entry in '{}': '{}' (='{}'). last label was '{}'",
|
||||
fname,
|
||||
label,
|
||||
entry,
|
||||
last_label)};
|
||||
{
|
||||
ROCP_CI_LOG(WARNING) << fmt::format(
|
||||
"duplicate entry in '{}': '{}' (='{}'). last label was '{}'",
|
||||
fname,
|
||||
label,
|
||||
entry,
|
||||
last_label);
|
||||
continue;
|
||||
}
|
||||
|
||||
if(!label.empty()) last_label = std::move(label);
|
||||
}
|
||||
@@ -305,21 +323,22 @@ read_property(const MapT& data, const std::string& label, Tp& value)
|
||||
constexpr auto max_value = std::numeric_limits<Tp>::max();
|
||||
if(local_value < min_value)
|
||||
{
|
||||
throw std::runtime_error{
|
||||
fmt::format("data with label {} has a value (={}) which is less "
|
||||
"than the min value for the type (={})",
|
||||
label,
|
||||
local_value,
|
||||
min_value)};
|
||||
ROCP_CI_LOG(WARNING) << fmt::format(
|
||||
"data with label {} has a value (={}) which is less "
|
||||
"than the min value for the type (={})",
|
||||
label,
|
||||
local_value,
|
||||
min_value);
|
||||
return;
|
||||
}
|
||||
else if(local_value > max_value)
|
||||
{
|
||||
throw std::runtime_error{fmt::format("data with label {} has a value (={}) which is "
|
||||
"greater "
|
||||
"than the max value for the type (={})",
|
||||
label,
|
||||
local_value,
|
||||
max_value)};
|
||||
ROCP_CI_LOG(WARNING) << fmt::format("data with label {} has a value (={}) which is "
|
||||
"greater than the max value for the type (={})",
|
||||
label,
|
||||
local_value,
|
||||
max_value);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr(std::is_const<Tp>::value)
|
||||
@@ -544,13 +563,17 @@ using unique_agent_t = std::unique_ptr<rocprofiler_agent_t, void (*)(rocprofiler
|
||||
auto
|
||||
read_topology()
|
||||
{
|
||||
auto sysfs_nodes_path = fs::path{"/sys/class/kfd/kfd/topology/nodes/"};
|
||||
auto data = std::vector<unique_agent_t>{};
|
||||
|
||||
const auto sysfs_nodes_path = fs::path{"/sys/class/kfd/kfd/topology/nodes"};
|
||||
if(!fs::exists(sysfs_nodes_path))
|
||||
throw std::runtime_error{
|
||||
fmt::format("sysfs nodes path '{}' does not exist", sysfs_nodes_path.string())};
|
||||
{
|
||||
ROCP_CI_LOG(WARNING) << fmt::format("sysfs nodes path '{}' does not exist",
|
||||
sysfs_nodes_path.string());
|
||||
return data;
|
||||
}
|
||||
|
||||
const auto& cpu_info_v = get_cpu_info();
|
||||
auto data = std::vector<unique_agent_t>{};
|
||||
uint64_t idcount = 0;
|
||||
uint64_t nodecount = 0;
|
||||
uint64_t cpucount = 0;
|
||||
|
||||
@@ -124,11 +124,12 @@ rocprofiler::buffer::instance::emplace(uint32_t category, uint32_t kind, Tp& val
|
||||
{
|
||||
if(buffers.at(idx).capacity() < sizeof(value))
|
||||
{
|
||||
auto msg = std::stringstream{};
|
||||
msg << "buffer " << buffer_id << " to small (size=" << buffers.at(idx).capacity()
|
||||
<< ") to hold an object of type " << common::cxx_demangle(typeid(value).name())
|
||||
<< " with size " << sizeof(value);
|
||||
throw std::runtime_error(msg.str());
|
||||
ROCP_CI_LOG(ERROR) << "buffer " << buffer_id
|
||||
<< " too small (size=" << buffers.at(idx).capacity()
|
||||
<< ") to hold an object of type "
|
||||
<< common::cxx_demangle(typeid(value).name()) << " with size "
|
||||
<< sizeof(value);
|
||||
return false;
|
||||
}
|
||||
|
||||
if(policy == ROCPROFILER_BUFFER_POLICY_LOSSLESS)
|
||||
|
||||
@@ -91,18 +91,14 @@ init_cpu_pool(const AmdExtTable& api, rocprofiler::hsa::AgentCache& agent)
|
||||
|
||||
auto status =
|
||||
api.hsa_amd_agent_iterate_memory_pools_fn(agent.near_cpu(), FindStandardPool, &params);
|
||||
if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
|
||||
{
|
||||
throw std::runtime_error("Error: Command Buffer Pool is not initialized");
|
||||
}
|
||||
ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
|
||||
<< "HSA Command Buffer Pool is not initialized";
|
||||
|
||||
params.second = &agent.kernarg_pool();
|
||||
status =
|
||||
api.hsa_amd_agent_iterate_memory_pools_fn(agent.near_cpu(), FindKernArgPool, &(params));
|
||||
if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
|
||||
{
|
||||
throw std::runtime_error("Error: Output Buffer Pool is not initialized");
|
||||
}
|
||||
ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
|
||||
<< "HSA Output Buffer Pool is not initialized";
|
||||
}
|
||||
|
||||
void
|
||||
@@ -113,10 +109,8 @@ init_gpu_pool(const AmdExtTable& api, rocprofiler::hsa::AgentCache& agent)
|
||||
auto status =
|
||||
api.hsa_amd_agent_iterate_memory_pools_fn(agent.get_hsa_agent(), FindStandardPool, &params);
|
||||
|
||||
if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
|
||||
{
|
||||
throw std::runtime_error("Error: GPU Pool is not initialized");
|
||||
}
|
||||
ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
|
||||
<< "HSA GPU Pool is not initialized";
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -153,10 +147,8 @@ AgentCache::init_device_counting_service_queue(const CoreApiTable& api,
|
||||
UINT32_MAX,
|
||||
UINT32_MAX,
|
||||
&m_profile_queue);
|
||||
if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
|
||||
{
|
||||
throw std::runtime_error("Error: Queue is not initialized");
|
||||
}
|
||||
ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
|
||||
<< "HSA Queue is not initialized";
|
||||
|
||||
CHECK(ext.hsa_amd_queue_set_priority_fn) << "no hsa_amd_queue_set_priority_fn in api table";
|
||||
ext.hsa_amd_queue_set_priority_fn(m_profile_queue, HSA_AMD_QUEUE_PRIORITY_HIGH);
|
||||
|
||||
+12
-7
@@ -424,8 +424,8 @@ parse_event<ROCPROFILER_PAGE_MIGRATION_DROPPED_EVENT>(std::string_view str)
|
||||
template <>
|
||||
page_migration_record_t parse_event<ROCPROFILER_PAGE_MIGRATION_NONE>(std::string_view)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"ROCPROFILER_PAGE_MIGRATION_NONE for parsing page migration events should not happen");
|
||||
ROCP_CI_LOG(WARNING)
|
||||
<< "ROCPROFILER_PAGE_MIGRATION_NONE for parsing page migration events should not happen";
|
||||
}
|
||||
|
||||
template <size_t OpInx, size_t... OpInxs>
|
||||
@@ -648,10 +648,11 @@ struct poll_kfd_t
|
||||
|
||||
[&]() {
|
||||
const auto retcode = pipe2(&thread_pipes[0], DEFAULT_FLAGS);
|
||||
|
||||
if(retcode != 0)
|
||||
throw std::runtime_error{
|
||||
fmt::format("Pipe creation for thread notify failed with {} code\n", retcode)};
|
||||
const auto _err = errno;
|
||||
ROCP_FATAL_IF(retcode != 0)
|
||||
<< fmt::format("Pipe creation for page-migration thread notify returned {} :: {}\n",
|
||||
retcode,
|
||||
strerror(_err));
|
||||
}();
|
||||
|
||||
thread_notify = pollfd{
|
||||
@@ -792,7 +793,11 @@ poll_events(small_vector<pollfd> file_handles)
|
||||
auto poll_ret = poll(file_handles.data(), file_handles.size(), -1);
|
||||
|
||||
if(poll_ret == -1)
|
||||
throw std::runtime_error{"Background thread file descriptors are invalid"};
|
||||
{
|
||||
ROCP_CI_LOG(WARNING)
|
||||
<< "Background thread file descriptors for page-migration are invalid";
|
||||
return;
|
||||
}
|
||||
|
||||
if((exitfd.revents & POLLIN) != 0)
|
||||
{
|
||||
|
||||
+4
-2
@@ -78,14 +78,16 @@ amd_intercept_marker_handler_callback(const struct amd_aql_intercept_marker_s* p
|
||||
if(ext_table_->hsa_amd_queue_get_info_fn(queue, HSA_AMD_QUEUE_INFO_AGENT, &hsa_agent) !=
|
||||
HSA_STATUS_SUCCESS)
|
||||
{
|
||||
throw std::runtime_error("Cannot map hsa_queue_t* to hsa_agent_t");
|
||||
ROCP_CI_LOG(WARNING) << "Cannot map hsa_queue_t* to hsa_agent_t";
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t doorbell_id = 0;
|
||||
if(ext_table_->hsa_amd_queue_get_info_fn(queue, HSA_AMD_QUEUE_INFO_DOORBELL_ID, &doorbell_id) !=
|
||||
HSA_STATUS_SUCCESS)
|
||||
{
|
||||
throw std::runtime_error("Cannot map hsa_queue_t* to doorbell_id");
|
||||
ROCP_CI_LOG(WARNING) << "Cannot map hsa_queue_t* to doorbell id";
|
||||
return;
|
||||
}
|
||||
|
||||
auto internal_correlation = packet->user_data[0];
|
||||
|
||||
+9
-6
@@ -21,12 +21,12 @@
|
||||
// SOFTWARE.
|
||||
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.hpp"
|
||||
|
||||
#include "lib/rocprofiler-sdk/details/kfd_ioctl.h"
|
||||
|
||||
#include "lib/common/logging.hpp"
|
||||
#include "lib/rocprofiler-sdk/details/kfd_ioctl.h"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter_types.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
|
||||
#include <sys/ioctl.h>
|
||||
|
||||
#include <fcntl.h>
|
||||
@@ -61,14 +61,15 @@ struct pc_sampling_ioctl_version_t
|
||||
int
|
||||
kfd_open()
|
||||
{
|
||||
int fd = -1;
|
||||
static const char kfd_device_name[] = "/dev/kfd";
|
||||
int fd = -1;
|
||||
constexpr auto* kfd_device_name = "/dev/kfd";
|
||||
|
||||
fd = open(kfd_device_name, O_RDWR | O_CLOEXEC);
|
||||
|
||||
if(fd == -1)
|
||||
{
|
||||
throw std::runtime_error("Cannot open /dev/kfd");
|
||||
ROCP_CI_LOG(WARNING) << fmt::format("Cannot open {} for pc sampling", kfd_device_name);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return fd;
|
||||
@@ -485,6 +486,8 @@ ioctl_pcs_create(const rocprofiler_agent_t* agent,
|
||||
args.num_sample_info = 1;
|
||||
args.trace_id = INVALID_TRACE_ID;
|
||||
|
||||
if(get_kfd_fd() == -1) return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE;
|
||||
|
||||
auto ioctl_ret = ioctl(get_kfd_fd(), AMDKFD_IOC_PC_SAMPLE, &args);
|
||||
*ioctl_pcs_id = args.trace_id;
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
// SOFTWARE.
|
||||
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/utils.hpp"
|
||||
#include "lib/common/logging.hpp"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/defines.hpp"
|
||||
|
||||
#if ROCPROFILER_SDK_HSA_PC_SAMPLING > 0
|
||||
@@ -49,7 +50,7 @@ get_matching_hsa_pcs_method(rocprofiler_pc_sampling_method_t method)
|
||||
case ROCPROFILER_PC_SAMPLING_METHOD_LAST: break;
|
||||
}
|
||||
|
||||
throw std::runtime_error("Illegal pc sampling method\n");
|
||||
ROCP_FATAL << "Illegal pc sampling method " << method;
|
||||
}
|
||||
|
||||
hsa_ven_amd_pcs_units_t
|
||||
@@ -66,7 +67,7 @@ get_matching_hsa_pcs_units(rocprofiler_pc_sampling_unit_t unit)
|
||||
case ROCPROFILER_PC_SAMPLING_UNIT_LAST: break;
|
||||
}
|
||||
|
||||
throw std::runtime_error("Illegal pc sampling units\n");
|
||||
ROCP_FATAL << "Illegal pc sampling unit " << unit;
|
||||
}
|
||||
} // namespace utils
|
||||
} // namespace pc_sampling
|
||||
|
||||
@@ -77,6 +77,7 @@
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <system_error>
|
||||
#include <thread>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
@@ -101,6 +102,49 @@ namespace
|
||||
{
|
||||
namespace fs = ::rocprofiler::common::filesystem;
|
||||
|
||||
bool
|
||||
resolved_exists(std::string_view fname)
|
||||
{
|
||||
if(fs::is_symlink(fname))
|
||||
{
|
||||
// NOTE: Use of ROCP_CI_LOG(WARNING) causes segfault. Likely bc glog is not fully
|
||||
// initialized
|
||||
auto _errc = std::error_code{};
|
||||
auto _symlinked = fs::read_symlink(fname, _errc);
|
||||
if(_errc && _symlinked.empty())
|
||||
{
|
||||
ROCP_WARNING << fmt::format("Symbolic link '{}' returned error code {} :: {}",
|
||||
fname,
|
||||
_errc.value(),
|
||||
_errc.message());
|
||||
return false;
|
||||
}
|
||||
else if(_errc && !_symlinked.empty())
|
||||
{
|
||||
ROCP_WARNING << fmt::format("Symbolic link '{}' -> '{}' returned error code {} :: {}",
|
||||
fname,
|
||||
_symlinked.string(),
|
||||
_errc.value(),
|
||||
_errc.message());
|
||||
return false;
|
||||
}
|
||||
|
||||
if(_symlinked.is_relative()) _symlinked = fs::path{fname}.parent_path() / _symlinked;
|
||||
|
||||
ROCP_TRACE << fmt::format("Symbolic link:\n\t{}\n\t\t-> {}", fname, _symlinked.string());
|
||||
|
||||
if(!fs::exists(_symlinked))
|
||||
{
|
||||
ROCP_WARNING << fmt::format("{} is broken symbolic link", fname);
|
||||
return false;
|
||||
}
|
||||
|
||||
return resolved_exists(fs::absolute(_symlinked).string());
|
||||
}
|
||||
|
||||
return fs::exists(fname);
|
||||
}
|
||||
|
||||
// invoke all rocprofiler_configure symbols
|
||||
bool
|
||||
invoke_client_configures();
|
||||
@@ -257,14 +301,17 @@ find_clients()
|
||||
{
|
||||
ROCP_INFO << "[ROCP_TOOL_LIBRARIES] searching " << itr << " for rocprofiler_configure";
|
||||
|
||||
if(fs::exists(itr))
|
||||
if(fs::exists(itr) && resolved_exists(itr))
|
||||
{
|
||||
auto elfinfo = common::elf_utils::read(itr);
|
||||
if(!elfinfo.has_symbol(std::regex{"^rocprofiler_configure$"}))
|
||||
{
|
||||
ROCP_FATAL << "[ROCP_TOOL_LIBRARIES] rocprofiler-sdk tool library '" << itr
|
||||
<< "' did not contain rocprofiler_configure symbol (search method: "
|
||||
"ELF parsing)";
|
||||
ROCP_CI_LOG(WARNING) << fmt::format(
|
||||
"[ROCP_TOOL_LIBRARIES] rocprofiler-sdk tool library '{}' did not "
|
||||
"contain rocprofiler_configure symbol (search method: ELF parsing). "
|
||||
"Attempting dlopen anyway since the library was explicitly listed in "
|
||||
"ROCP_TOOL_LIBRARIES",
|
||||
itr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -295,10 +342,10 @@ find_clients()
|
||||
{
|
||||
auto _sym = rocprofiler_configure_dlsym(handle);
|
||||
// FATAL bc they explicitly said this was a tool library
|
||||
ROCP_FATAL_IF(!_sym)
|
||||
ROCP_CI_LOG_IF(WARNING, !_sym)
|
||||
<< "[ROCP_TOOL_LIBRARIES] rocprofiler-sdk tool library '" << itr
|
||||
<< "' did not contain rocprofiler_configure symbol (search method: dlsym)";
|
||||
if(is_unique_configure_func(_sym)) emplace_client(itr, handle, _sym);
|
||||
if(_sym && is_unique_configure_func(_sym)) emplace_client(itr, handle, _sym);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -323,13 +370,22 @@ find_clients()
|
||||
{
|
||||
ROCP_INFO << "searching " << itr << " for rocprofiler_configure";
|
||||
|
||||
if(fs::exists(itr))
|
||||
if(fs::exists(itr) && resolved_exists(itr))
|
||||
{
|
||||
auto elfinfo = common::elf_utils::read(itr);
|
||||
if(!elfinfo.has_symbol(std::regex{"^rocprofiler_configure$"})) continue;
|
||||
if(!elfinfo.has_symbol(std::regex{"^rocprofiler_configure$"}))
|
||||
{
|
||||
ROCP_INFO << fmt::format(
|
||||
"Shared library '{}' did not contain the 'rocprofiler_configure' symbol "
|
||||
"(search method: ELF parsing) required by rocprofiler-sdk for tools",
|
||||
itr);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCP_INFO << fmt::format(
|
||||
"Shared library '{}' either does not exist or is a broken symbolic link", itr);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user