SDK: remove majority of exceptions (#176)

* SDK: remove majority of exceptions

- replace with ROCP_FATAL, ROCP_CI_LOG(WARNING), etc.
- improve logging of symbolic links
- add --readlink and --realpath (hidden options) to rocprofv3 to follow symlinks for preloaded libraries

* Add rocprofv3 --rocm-root argument

* Fix registration resolved_exists

* Fix rocprofv3_avail.py

* Update logging for rocprofiler_configure search

- relax failure conditions

* Misc clang-tidy fixes

* Fix merge

* Fix merge

---------

Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
Co-authored-by: Bhardwaj, Gopesh <Gopesh.Bhardwaj@amd.com>

[ROCm/rocprofiler-sdk commit: 470f347e50]
Этот коммит содержится в:
Madsen, Jonathan
2025-02-18 10:44:37 -06:00
коммит произвёл GitHub
родитель 95ac740f25
Коммит e503b1f4cc
17 изменённых файлов: 270 добавлений и 140 удалений
+35
Просмотреть файл
@@ -546,6 +546,23 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
nargs="*",
)
advanced_options.add_argument(
"--rocm-root",
help="Use the given path as the root ROCm path instead of the relative path of this script",
type=str,
metavar="PATH",
default=None,
)
add_parser_bool_argument(
advanced_options,
"--readlink",
help=argparse.SUPPRESS,
)
add_parser_bool_argument(
advanced_options,
"--realpath",
help=argparse.SUPPRESS,
)
# below is available for CI because LD_PRELOADing a library linked to a sanitizer library
# causes issues in apps where HIP is part of shared library.
add_parser_bool_argument(
@@ -874,6 +891,8 @@ def run(app_args, args, **kwargs):
ROCPROFV3_DIR = os.path.dirname(os.path.realpath(__file__))
ROCM_DIR = os.path.dirname(ROCPROFV3_DIR)
if args.rocm_root is not None:
ROCM_DIR = os.path.abspath(args.rocm_root)
ROCPROF_TOOL_LIBRARY = f"{ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so"
ROCPROF_SDK_LIBRARY = f"{ROCM_DIR}/lib/librocprofiler-sdk.so"
ROCPROF_ROCTX_LIBRARY = f"{ROCM_DIR}/lib/librocprofiler-sdk-roctx.so"
@@ -884,6 +903,22 @@ def run(app_args, args, **kwargs):
f"{ROCM_DIR}/libexec/rocprofiler-sdk/librocprofv3-list-avail.so"
)
def resolve_path(val):
    """Return ``val``, optionally following it when it is a symbolic link.

    A path that does not exist is reported through ``fatal_error``. When
    ``val`` is a symbolic link, ``args.readlink`` replaces it with the
    link's immediate target and ``args.realpath`` replaces it with the
    result of ``os.path.realpath``. Otherwise ``val`` is returned unchanged.
    """
    if not os.path.exists(val):
        fatal_error(f"{val} does not exist")
    if os.path.islink(val):
        if args.readlink:
            # os.readlink() may return a target relative to the directory
            # holding the link, so anchor it there instead of at the cwd.
            link_dir = os.path.dirname(os.path.abspath(val))
            val = os.path.abspath(os.path.join(link_dir, os.readlink(val)))
        if args.realpath:
            val = os.path.realpath(val)
    return val
ROCPROF_TOOL_LIBRARY = resolve_path(ROCPROF_TOOL_LIBRARY)
ROCPROF_SDK_LIBRARY = resolve_path(ROCPROF_SDK_LIBRARY)
ROCPROF_ROCTX_LIBRARY = resolve_path(ROCPROF_ROCTX_LIBRARY)
ROCPROF_KOKKOSP_LIBRARY = resolve_path(ROCPROF_KOKKOSP_LIBRARY)
ROCPROF_LIST_AVAIL_TOOL_LIBRARY = resolve_path(ROCPROF_LIST_AVAIL_TOOL_LIBRARY)
prepend_preload = [itr for itr in args.preload if itr]
append_preload = [
ROCPROF_TOOL_LIBRARY,
Обычный файл → Исполняемый файл
+12 -1
Просмотреть файл
@@ -69,10 +69,21 @@ class pc_config:
self.max_interval = max_interval
ROCPROFV3_AVAIL_DIR = os.path.dirname(os.path.realpath(__file__))
ROCM_DIR = os.path.dirname(ROCPROFV3_AVAIL_DIR)
ROCPROF_LIST_AVAIL_TOOL_LIBRARY = (
f"{ROCM_DIR}/libexec/rocprofiler-sdk/librocprofv3-list-avail.so"
)
MAX_STR = 256
libname = os.environ.get("ROCPROF_LIST_AVAIL_TOOL_LIBRARY")
libname = os.environ.get(
"ROCPROF_LIST_AVAIL_TOOL_LIBRARY", ROCPROF_LIST_AVAIL_TOOL_LIBRARY
)
c_lib = ctypes.CDLL(libname)
if c_lib is None:
fatal_error(f"Error opening {libname}")
c_lib.get_number_of_counters.restype = ctypes.c_ulong
c_lib.get_number_of_pc_sample_configs.restype = ctypes.c_ulong
c_lib.get_number_of_dimensions.restype = ctypes.c_ulong
+12 -11
Просмотреть файл
@@ -24,6 +24,8 @@
#include "lib/common/environment.hpp"
#include "lib/common/units.hpp"
#include <fmt/format.h>
#include <sys/mman.h>
#include <atomic>
#include <cerrno>
@@ -71,9 +73,8 @@ ring_buffer::operator=(ring_buffer&& rhs) noexcept
void
ring_buffer::init(size_t _size)
{
if(m_init)
throw std::runtime_error("rocprofiler::common::container::base::ring_buffer::init(size_t) "
":: already initialized");
ROCP_FATAL_IF(m_init)
<< "rocprofiler::common::container::base::ring_buffer::init(size_t) :: already initialized";
m_init = true;
@@ -85,9 +86,10 @@ ring_buffer::init(size_t _size)
if((_size % units::get_page_size()) > 0)
{
std::ostringstream _oss{};
_oss << "Error! size is not a multiple of page size: " << _size << " % "
<< units::get_page_size() << " = " << (_size % units::get_page_size());
throw std::runtime_error(_oss.str());
ROCP_FATAL << fmt::format("Error! size is not a multiple of page size: {} % {} = {}",
_size,
units::get_page_size(),
(_size % units::get_page_size()));
}
m_size = _size;
@@ -101,7 +103,7 @@ ring_buffer::init(size_t _size)
{
destroy();
auto _err = errno;
throw std::runtime_error(strerror(_err));
ROCP_FATAL << fmt::format("mmap failed with errno {} :: {}", _err, strerror(_err));
}
}
@@ -256,10 +258,9 @@ ring_buffer::can_clear() const
bool
ring_buffer::clear()
{
if(!can_clear())
throw std::runtime_error(
"ring_buffer does not permit invoking clear() member function when the read "
"pointer is non-zero because this introduces thread-safety issues");
ROCP_CI_LOG_IF(WARNING, !can_clear())
<< "ring_buffer does not permit invoking clear() member function when the read pointer is "
"non-zero because this introduces thread-safety issues";
m_write_count.store(0, std::memory_order_release);
return true;
+36 -40
Просмотреть файл
@@ -22,6 +22,9 @@
#include "lib/common/elf_utils.hpp"
#include <rocprofiler-sdk/cxx/utility.hpp>
#include <fmt/format.h>
#include <elfio/elfio.hpp>
#include <sys/stat.h>
@@ -47,22 +50,14 @@ namespace
{
const ELFIO::Elf_Xword PAGE_SIZE = sysconf(_SC_PAGESIZE);
template <typename Tp>
std::string
as_hex_string(Tp&& _v, size_t _w = 16)
{
auto _ss = std::stringstream{};
_ss.fill('0');
_ss << "0x" << std::hex << std::setw(_w) << std::forward<Tp>(_v);
return _ss.str();
}
using ::rocprofiler::sdk::utility::as_hex;
} // namespace
SymbolEntry::SymbolEntry(unsigned int _idx, const accessor_type& _accessor)
: index{_idx}
{
if(!_accessor.get_symbol(index, name, value, size, bind, type, section_index, other))
throw std::runtime_error("Error in ELFIO::symbol_section_accessor::get_symbol");
ROCP_WARNING << "ELFIO::symbol_section_accessor::get_symbol failed of symbol " << _idx;
}
DynamicEntry::DynamicEntry(unsigned int _idx, const accessor_type& _accessor)
@@ -75,7 +70,7 @@ RelocationEntry::RelocationEntry(unsigned int _idx, const accessor_type& _access
: index{_idx}
{
if(!_accessor.get_entry(_idx, offset, symbol, type, addend))
throw std::runtime_error("Error in ELFIO::relocation_section_accessor::get_entry");
ROCP_WARNING << "ELFIO::relocation_section_accessor::get_entry failed for symbol " << _idx;
}
ElfInfo::ElfInfo(std::string _fname)
@@ -111,24 +106,25 @@ read(const std::string& _inp)
ROCP_TRACE << "\nReading " << _inp;
if(!reader.load(_inp)) throw std::runtime_error("Could not load elf file " + _inp);
if(!reader.load(_inp))
ROCP_WARNING << fmt::format("ELF parsing for '{}' did not succeed", _inp);
if(reader.get_class() == ELFIO::ELFCLASS32)
ROCP_TRACE << "ELF 32-bit";
ROCP_TRACE << " - ELF 32-bit";
else
ROCP_TRACE << "ELF 64-bit";
ROCP_TRACE << " - ELF 64-bit";
ROCP_TRACE << "ELF file encoding: "
ROCP_TRACE << " - ELF file encoding: "
<< ((reader.get_encoding() == ELFIO::ELFDATA2LSB) ? std::string_view{"Little endian"}
: std::string_view{"Big endian"});
ROCP_TRACE << "ELF version: " << reader.get_elf_version();
ROCP_TRACE << "ELF header size: " << reader.get_header_size();
ROCP_TRACE << "ELF OS ABI: " << reader.get_os_abi();
ROCP_TRACE << " - ELF version: " << reader.get_elf_version();
ROCP_TRACE << " - ELF header size: " << reader.get_header_size();
ROCP_TRACE << " - ELF OS ABI: " << reader.get_os_abi();
// Print ELF file sections info
ELFIO::Elf_Half sec_num = reader.sections.size();
ROCP_TRACE << "Number of sections: " << sec_num;
ROCP_TRACE << " - Number of sections: " << sec_num;
for(ELFIO::Elf_Half j = 0; j < sec_num; ++j)
{
@@ -143,85 +139,85 @@ read(const std::string& _inp)
for(ELFIO::Elf_Half j = 0; j < sec_num; ++j)
{
Section* psec = sections.at(j);
ROCP_TRACE << " [" << j << "] \t" << std::setw(20) << psec->get_name() << "\t : \t"
ROCP_TRACE << " [" << j << "] \t" << std::setw(20) << psec->get_name() << "\t : \t"
<< "size / entry-size = " << std::setw(6) << psec->get_size() << " / "
<< std::setw(3) << psec->get_entry_size()
<< " | addr: " << as_hex_string(psec->get_address())
<< " | offset: " << as_hex_string(psec->get_offset());
<< " | addr: " << as_hex(psec->get_address(), 16)
<< " | offset: " << as_hex(psec->get_offset(), 16);
if(psec->get_size() == 0) continue;
if(psec->get_type() == ELFIO::SHT_SYMTAB)
{
const ELFIO::symbol_section_accessor _symbols(reader, psec);
ROCP_TRACE << " Number of symbol entries: " << _symbols.get_symbols_num();
ROCP_TRACE << " - Number of symbol entries: " << _symbols.get_symbols_num();
for(ELFIO::Elf_Xword k = 0; k < _symbols.get_symbols_num(); ++k)
symbol_entries.emplace_back(k, _symbols);
}
else if(psec->get_type() == ELFIO::SHT_DYNSYM)
{
const ELFIO::symbol_section_accessor _symbols(reader, psec);
ROCP_TRACE << " Number of dynamic symbol entries: " << _symbols.get_symbols_num();
ROCP_TRACE << " - Number of dynamic symbol entries: " << _symbols.get_symbols_num();
for(ELFIO::Elf_Xword k = 0; k < _symbols.get_symbols_num(); ++k)
dynamic_symbol_entries.emplace_back(k, _symbols);
}
else if(psec->get_type() == ELFIO::SHT_DYNAMIC)
{
const ELFIO::dynamic_section_accessor dynamic{reader, psec};
ROCP_TRACE << " Number of dynamic entries: " << dynamic.get_entries_num();
ROCP_TRACE << " - Number of dynamic entries: " << dynamic.get_entries_num();
for(ELFIO::Elf_Xword k = 0; k < dynamic.get_entries_num(); ++k)
dynamic_entries.emplace_back(k, dynamic);
}
else if(psec->get_type() == ELFIO::SHT_REL || psec->get_type() == ELFIO::SHT_RELA)
{
const ELFIO::relocation_section_accessor reloc{reader, psec};
ROCP_TRACE << " Number of relocation entries: " << reloc.get_entries_num();
ROCP_TRACE << " - Number of relocation entries: " << reloc.get_entries_num();
for(ELFIO::Elf_Xword k = 0; k < reloc.get_entries_num(); ++k)
reloc_entries.emplace_back(k, reloc);
}
}
ROCP_TRACE << "Symbols:";
ROCP_TRACE << " - Symbols:";
for(size_t k = 0; k < symbol_entries.size(); ++k)
{
if(!symbol_entries.at(k).name.empty())
ROCP_TRACE << " [" << k << "] " << symbol_entries.at(k).name;
ROCP_TRACE << " [" << k << "] " << symbol_entries.at(k).name;
}
ROCP_TRACE << "Dynamic Symbols:";
ROCP_TRACE << " - Dynamic Symbols:";
for(size_t k = 0; k < dynamic_symbol_entries.size(); ++k)
{
if(!dynamic_symbol_entries.at(k).name.empty())
ROCP_TRACE << " [" << k << "] " << dynamic_symbol_entries.at(k).name;
ROCP_TRACE << " [" << k << "] " << dynamic_symbol_entries.at(k).name;
}
ROCP_TRACE << "Dynamic entries:";
ROCP_TRACE << " - Dynamic entries:";
for(size_t k = 0; k < dynamic_entries.size(); ++k)
{
if(!dynamic_entries.at(k).name.empty())
ROCP_TRACE << " [" << k << "] " << dynamic_entries.at(k).name;
ROCP_TRACE << " [" << k << "] " << dynamic_entries.at(k).name;
}
ROCP_TRACE << "Relocation entries:";
ROCP_TRACE << " - Relocation entries:";
for(size_t k = 0; k < reloc_entries.size(); ++k)
{
auto _sym_idx = reloc_entries.at(k).symbol;
auto _name = std::string{};
if(_sym_idx < symbol_entries.size()) _name = symbol_entries.at(_sym_idx).name;
if(!_name.empty()) ROCP_TRACE << " [" << k << "] " << _name;
if(!_name.empty()) ROCP_TRACE << " [" << k << "] " << _name;
}
// Print ELF file segments info
ELFIO::Elf_Half seg_num = reader.segments.size();
ROCP_TRACE << "Number of segments: " << seg_num;
ROCP_TRACE << " - Number of segments: " << seg_num;
for(ELFIO::Elf_Half j = 0; j < seg_num; ++j)
{
const ELFIO::segment* pseg = reader.segments[j];
ROCP_TRACE << " [" << std::setw(2) << j << "] flags: " << as_hex_string(pseg->get_flags())
<< " offset: " << as_hex_string(pseg->get_offset())
<< " align: " << as_hex_string(pseg->get_align())
<< " virt: " << as_hex_string(pseg->get_virtual_address())
<< " phys: " << as_hex_string(pseg->get_physical_address())
ROCP_TRACE << " [" << std::setw(2) << j << "] flags: " << as_hex(pseg->get_flags(), 16)
<< " offset: " << as_hex(pseg->get_offset(), 16)
<< " align: " << as_hex(pseg->get_align(), 16)
<< " virt: " << as_hex(pseg->get_virtual_address(), 16)
<< " phys: " << as_hex(pseg->get_physical_address(), 16)
<< " fsize: " << std::setw(8) << pseg->get_file_size()
<< " msize: " << std::setw(8) << pseg->get_memory_size();
}
+3 -2
Просмотреть файл
@@ -24,6 +24,8 @@
#include "lib/common/demangle.hpp"
#include "lib/common/logging.hpp"
#include <fmt/format.h>
#include <cctype>
#include <cstdint>
#include <cstdio>
@@ -62,8 +64,7 @@ get_env(std::string_view env_id, bool _default)
{
if(std::string_view{env_var}.empty())
{
throw std::runtime_error(std::string{"No boolean value provided for "} +
std::string{env_id});
ROCP_FATAL << fmt::format("No boolean value provided for {}", env_id);
}
if(std::string_view{env_var}.find_first_not_of("0123456789") == std::string_view::npos)
+2 -2
Просмотреть файл
@@ -23,6 +23,7 @@
#pragma once
#include "lib/common/defines.hpp"
#include "lib/common/logging.hpp"
#include <cstddef>
#include <cstdint>
@@ -68,8 +69,7 @@ public:
{
if(!(m_addrs.empty() && m_blocks.empty()))
{
throw std::runtime_error{"cannot call pool::rebind() after alloc"};
::abort();
ROCP_FATAL << "cannot call pool::rebind() after alloc";
}
m_size = size;
+8 -7
Просмотреть файл
@@ -124,21 +124,16 @@ write_json(json_output& json_ar,
auto code_object_load_info = tool_metadata.get_code_object_load_info();
auto att_filenames = tool_metadata.get_att_filenames();
auto code_object_snapshot_filenames = std::vector<std::string>{};
code_object_snapshot_filenames.reserve(code_object_load_info.size());
for(const auto& info : code_object_load_info)
{
code_object_snapshot_filenames.emplace_back(fs::path(info.name).filename());
}
json_ar.setNextName("strings");
json_ar.startNode();
json_ar(cereal::make_nvp("callback_records", callback_name_info));
json_ar(cereal::make_nvp("buffer_records", buffer_name_info));
json_ar(cereal::make_nvp("marker_api", marker_msg_data));
json_ar(
cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions()));
json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments()));
json_ar(cereal::make_nvp("att_filenames", att_filenames));
json_ar(cereal::make_nvp("code_object_snapshot_filenames", code_object_snapshot_filenames));
{
auto _extern_corr_id_strings = std::map<size_t, std::string>{};
if(cfg.kernel_rename)
@@ -166,6 +161,12 @@ write_json(json_output& json_ar,
json_ar.finishNode();
}
json_ar(
cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions()));
json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments()));
json_ar(cereal::make_nvp("att_filenames", att_filenames));
json_ar(cereal::make_nvp("code_object_snapshot_filenames", code_object_snapshot_filenames));
json_ar.finishNode();
}
+1 -1
Просмотреть файл
@@ -256,7 +256,7 @@ metadata::get_att_filenames() const
{
for(const auto& file : filenames.second.second)
{
data.emplace_back(fs::path(file).filename());
data.emplace_back(fs::path{file}.filename());
}
}
return data;
+4 -2
Просмотреть файл
@@ -1007,9 +1007,11 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/,
if(cur_header == nullptr)
{
throw std::runtime_error{
"rocprofiler provided a null pointer to header. this should never happen"};
ROCP_CI_LOG(WARNING) << "rocprofiler provided a null pointer to buffer record header. "
"this should never happen";
continue;
}
else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING)
{
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE)
+51 -28
Просмотреть файл
@@ -207,11 +207,17 @@ read_file(const std::string& fname)
auto data = std::vector<std::string>{};
if(!is_readable(fs::path{fname}))
throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)};
{
ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname);
return data;
}
auto ifs = std::ifstream{fname};
if(!ifs || !ifs.good())
throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)};
{
ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname);
return data;
}
while(true)
{
@@ -231,11 +237,17 @@ read_map(const std::string& fname)
auto data = std::unordered_map<std::string, std::string>{};
if(!is_readable(fs::path{fname}))
throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)};
{
ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname);
return data;
}
auto ifs = std::ifstream{fname};
if(!ifs || !ifs.good())
throw std::runtime_error{fmt::format("file '{}' cannot be read", fname)};
{
ROCP_CI_LOG(WARNING) << fmt::format("file '{}' cannot be read", fname);
return data;
}
auto last_label = std::string{};
while(true)
@@ -247,17 +259,23 @@ read_map(const std::string& fname)
auto entry = std::string{};
ifs >> entry;
if(ifs.eof())
throw std::runtime_error{
fmt::format("unexpected file format in '{}' at {}", fname, label)};
{
ROCP_CI_LOG(WARNING) << fmt::format(
"unexpected file format in '{}' at {}", fname, label);
continue;
}
auto ret = data.emplace(label, entry);
if(!ret.second)
throw std::runtime_error{
fmt::format("duplicate entry in '{}': '{}' (='{}'). last label was '{}'",
fname,
label,
entry,
last_label)};
{
ROCP_CI_LOG(WARNING) << fmt::format(
"duplicate entry in '{}': '{}' (='{}'). last label was '{}'",
fname,
label,
entry,
last_label);
continue;
}
if(!label.empty()) last_label = std::move(label);
}
@@ -305,21 +323,22 @@ read_property(const MapT& data, const std::string& label, Tp& value)
constexpr auto max_value = std::numeric_limits<Tp>::max();
if(local_value < min_value)
{
throw std::runtime_error{
fmt::format("data with label {} has a value (={}) which is less "
"than the min value for the type (={})",
label,
local_value,
min_value)};
ROCP_CI_LOG(WARNING) << fmt::format(
"data with label {} has a value (={}) which is less "
"than the min value for the type (={})",
label,
local_value,
min_value);
return;
}
else if(local_value > max_value)
{
throw std::runtime_error{fmt::format("data with label {} has a value (={}) which is "
"greater "
"than the max value for the type (={})",
label,
local_value,
max_value)};
ROCP_CI_LOG(WARNING) << fmt::format("data with label {} has a value (={}) which is "
"greater than the max value for the type (={})",
label,
local_value,
max_value);
return;
}
if constexpr(std::is_const<Tp>::value)
@@ -544,13 +563,17 @@ using unique_agent_t = std::unique_ptr<rocprofiler_agent_t, void (*)(rocprofiler
auto
read_topology()
{
auto sysfs_nodes_path = fs::path{"/sys/class/kfd/kfd/topology/nodes/"};
auto data = std::vector<unique_agent_t>{};
const auto sysfs_nodes_path = fs::path{"/sys/class/kfd/kfd/topology/nodes"};
if(!fs::exists(sysfs_nodes_path))
throw std::runtime_error{
fmt::format("sysfs nodes path '{}' does not exist", sysfs_nodes_path.string())};
{
ROCP_CI_LOG(WARNING) << fmt::format("sysfs nodes path '{}' does not exist",
sysfs_nodes_path.string());
return data;
}
const auto& cpu_info_v = get_cpu_info();
auto data = std::vector<unique_agent_t>{};
uint64_t idcount = 0;
uint64_t nodecount = 0;
uint64_t cpucount = 0;
+6 -5
Просмотреть файл
@@ -124,11 +124,12 @@ rocprofiler::buffer::instance::emplace(uint32_t category, uint32_t kind, Tp& val
{
if(buffers.at(idx).capacity() < sizeof(value))
{
auto msg = std::stringstream{};
msg << "buffer " << buffer_id << " to small (size=" << buffers.at(idx).capacity()
<< ") to hold an object of type " << common::cxx_demangle(typeid(value).name())
<< " with size " << sizeof(value);
throw std::runtime_error(msg.str());
ROCP_CI_LOG(ERROR) << "buffer " << buffer_id
<< " too small (size=" << buffers.at(idx).capacity()
<< ") to hold an object of type "
<< common::cxx_demangle(typeid(value).name()) << " with size "
<< sizeof(value);
return false;
}
if(policy == ROCPROFILER_BUFFER_POLICY_LOSSLESS)
+8 -16
Просмотреть файл
@@ -91,18 +91,14 @@ init_cpu_pool(const AmdExtTable& api, rocprofiler::hsa::AgentCache& agent)
auto status =
api.hsa_amd_agent_iterate_memory_pools_fn(agent.near_cpu(), FindStandardPool, &params);
if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
{
throw std::runtime_error("Error: Command Buffer Pool is not initialized");
}
ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
<< "HSA Command Buffer Pool is not initialized";
params.second = &agent.kernarg_pool();
status =
api.hsa_amd_agent_iterate_memory_pools_fn(agent.near_cpu(), FindKernArgPool, &(params));
if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
{
throw std::runtime_error("Error: Output Buffer Pool is not initialized");
}
ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
<< "HSA Output Buffer Pool is not initialized";
}
void
@@ -113,10 +109,8 @@ init_gpu_pool(const AmdExtTable& api, rocprofiler::hsa::AgentCache& agent)
auto status =
api.hsa_amd_agent_iterate_memory_pools_fn(agent.get_hsa_agent(), FindStandardPool, &params);
if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
{
throw std::runtime_error("Error: GPU Pool is not initialized");
}
ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
<< "HSA GPU Pool is not initialized";
}
} // namespace
@@ -153,10 +147,8 @@ AgentCache::init_device_counting_service_queue(const CoreApiTable& api,
UINT32_MAX,
UINT32_MAX,
&m_profile_queue);
if(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
{
throw std::runtime_error("Error: Queue is not initialized");
}
ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
<< "HSA Queue is not initialized";
CHECK(ext.hsa_amd_queue_set_priority_fn) << "no hsa_amd_queue_set_priority_fn in api table";
ext.hsa_amd_queue_set_priority_fn(m_profile_queue, HSA_AMD_QUEUE_PRIORITY_HIGH);
+12 -7
Просмотреть файл
@@ -424,8 +424,8 @@ parse_event<ROCPROFILER_PAGE_MIGRATION_DROPPED_EVENT>(std::string_view str)
// Specialization for ROCPROFILER_PAGE_MIGRATION_NONE: the event text argument is unnamed and
// is never parsed.
// NOTE(review): the ROCP_CI_LOG(WARNING) statement is not followed by a return statement even
// though the function is declared to return page_migration_record_t -- confirm that a value is
// returned on any path where the throw is not executed.
template <>
page_migration_record_t parse_event<ROCPROFILER_PAGE_MIGRATION_NONE>(std::string_view)
{
throw std::runtime_error(
"ROCPROFILER_PAGE_MIGRATION_NONE for parsing page migration events should not happen");
ROCP_CI_LOG(WARNING)
<< "ROCPROFILER_PAGE_MIGRATION_NONE for parsing page migration events should not happen";
}
template <size_t OpInx, size_t... OpInxs>
@@ -648,10 +648,11 @@ struct poll_kfd_t
[&]() {
const auto retcode = pipe2(&thread_pipes[0], DEFAULT_FLAGS);
if(retcode != 0)
throw std::runtime_error{
fmt::format("Pipe creation for thread notify failed with {} code\n", retcode)};
const auto _err = errno;
ROCP_FATAL_IF(retcode != 0)
<< fmt::format("Pipe creation for page-migration thread notify returned {} :: {}\n",
retcode,
strerror(_err));
}();
thread_notify = pollfd{
@@ -792,7 +793,11 @@ poll_events(small_vector<pollfd> file_handles)
auto poll_ret = poll(file_handles.data(), file_handles.size(), -1);
if(poll_ret == -1)
throw std::runtime_error{"Background thread file descriptors are invalid"};
{
ROCP_CI_LOG(WARNING)
<< "Background thread file descriptors for page-migration are invalid";
return;
}
if((exitfd.revents & POLLIN) != 0)
{
+4 -2
Просмотреть файл
@@ -78,14 +78,16 @@ amd_intercept_marker_handler_callback(const struct amd_aql_intercept_marker_s* p
if(ext_table_->hsa_amd_queue_get_info_fn(queue, HSA_AMD_QUEUE_INFO_AGENT, &hsa_agent) !=
HSA_STATUS_SUCCESS)
{
throw std::runtime_error("Cannot map hsa_queue_t* to hsa_agent_t");
ROCP_CI_LOG(WARNING) << "Cannot map hsa_queue_t* to hsa_agent_t";
return;
}
uint64_t doorbell_id = 0;
if(ext_table_->hsa_amd_queue_get_info_fn(queue, HSA_AMD_QUEUE_INFO_DOORBELL_ID, &doorbell_id) !=
HSA_STATUS_SUCCESS)
{
throw std::runtime_error("Cannot map hsa_queue_t* to doorbell_id");
ROCP_CI_LOG(WARNING) << "Cannot map hsa_queue_t* to doorbell id";
return;
}
auto internal_correlation = packet->user_data[0];
+9 -6
Просмотреть файл
@@ -21,12 +21,12 @@
// SOFTWARE.
#include "lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.hpp"
#include "lib/rocprofiler-sdk/details/kfd_ioctl.h"
#include "lib/common/logging.hpp"
#include "lib/rocprofiler-sdk/details/kfd_ioctl.h"
#include "lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter_types.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <sys/ioctl.h>
#include <fcntl.h>
@@ -61,14 +61,15 @@ struct pc_sampling_ioctl_version_t
int
kfd_open()
{
int fd = -1;
static const char kfd_device_name[] = "/dev/kfd";
int fd = -1;
constexpr auto* kfd_device_name = "/dev/kfd";
fd = open(kfd_device_name, O_RDWR | O_CLOEXEC);
if(fd == -1)
{
throw std::runtime_error("Cannot open /dev/kfd");
ROCP_CI_LOG(WARNING) << fmt::format("Cannot open {} for pc sampling", kfd_device_name);
return -1;
}
return fd;
@@ -485,6 +486,8 @@ ioctl_pcs_create(const rocprofiler_agent_t* agent,
args.num_sample_info = 1;
args.trace_id = INVALID_TRACE_ID;
if(get_kfd_fd() == -1) return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE;
auto ioctl_ret = ioctl(get_kfd_fd(), AMDKFD_IOC_PC_SAMPLE, &args);
*ioctl_pcs_id = args.trace_id;
+3 -2
Просмотреть файл
@@ -21,6 +21,7 @@
// SOFTWARE.
#include "lib/rocprofiler-sdk/pc_sampling/utils.hpp"
#include "lib/common/logging.hpp"
#include "lib/rocprofiler-sdk/pc_sampling/defines.hpp"
#if ROCPROFILER_SDK_HSA_PC_SAMPLING > 0
@@ -49,7 +50,7 @@ get_matching_hsa_pcs_method(rocprofiler_pc_sampling_method_t method)
case ROCPROFILER_PC_SAMPLING_METHOD_LAST: break;
}
throw std::runtime_error("Illegal pc sampling method\n");
ROCP_FATAL << "Illegal pc sampling method " << method;
}
hsa_ven_amd_pcs_units_t
@@ -66,7 +67,7 @@ get_matching_hsa_pcs_units(rocprofiler_pc_sampling_unit_t unit)
case ROCPROFILER_PC_SAMPLING_UNIT_LAST: break;
}
throw std::runtime_error("Illegal pc sampling units\n");
ROCP_FATAL << "Illegal pc sampling unit " << unit;
}
} // namespace utils
} // namespace pc_sampling
+64 -8
Просмотреть файл
@@ -77,6 +77,7 @@
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
#include <thread>
#include <unordered_set>
#include <vector>
@@ -101,6 +102,49 @@ namespace
{
namespace fs = ::rocprofiler::common::filesystem;
// Returns true when fname exists after following any chain of symbolic links. Each link that
// is followed is traced, and a warning is logged when a link cannot be read or is broken.
//
// Only the std::error_code overloads of the filesystem queries are used, so a failing query is
// treated as "does not exist" instead of raising fs::filesystem_error.
bool
resolved_exists(std::string_view fname)
{
    auto       _errc  = std::error_code{};
    const auto _fpath = fs::path{fname};

    // not a symbolic link (or its status could not be queried): plain existence check
    if(!fs::is_symlink(_fpath, _errc)) return fs::exists(_fpath, _errc);

    // NOTE: Use of ROCP_CI_LOG(WARNING) causes segfault. Likely because glog is not fully
    // initialized
    auto _symlinked = fs::read_symlink(_fpath, _errc);
    if(_errc)
    {
        if(_symlinked.empty())
        {
            ROCP_WARNING << fmt::format("Symbolic link '{}' returned error code {} :: {}",
                                        fname,
                                        _errc.value(),
                                        _errc.message());
        }
        else
        {
            ROCP_WARNING << fmt::format("Symbolic link '{}' -> '{}' returned error code {} :: {}",
                                        fname,
                                        _symlinked.string(),
                                        _errc.value(),
                                        _errc.message());
        }
        return false;
    }

    // a relative target is relative to the directory containing the link
    if(_symlinked.is_relative()) _symlinked = _fpath.parent_path() / _symlinked;

    ROCP_TRACE << fmt::format("Symbolic link:\n\t{}\n\t\t-> {}", fname, _symlinked.string());

    if(!fs::exists(_symlinked, _errc))
    {
        ROCP_WARNING << fmt::format("{} is broken symbolic link", fname);
        return false;
    }

    const auto _absolute = fs::absolute(_symlinked, _errc);
    if(_errc)
    {
        ROCP_WARNING << fmt::format("Symbolic link '{}' -> '{}' returned error code {} :: {}",
                                    fname,
                                    _symlinked.string(),
                                    _errc.value(),
                                    _errc.message());
        return false;
    }

    // the target may itself be a symbolic link: keep following the chain
    return resolved_exists(_absolute.string());
}
// invoke all rocprofiler_configure symbols
bool
invoke_client_configures();
@@ -257,14 +301,17 @@ find_clients()
{
ROCP_INFO << "[ROCP_TOOL_LIBRARIES] searching " << itr << " for rocprofiler_configure";
if(fs::exists(itr))
if(fs::exists(itr) && resolved_exists(itr))
{
auto elfinfo = common::elf_utils::read(itr);
if(!elfinfo.has_symbol(std::regex{"^rocprofiler_configure$"}))
{
ROCP_FATAL << "[ROCP_TOOL_LIBRARIES] rocprofiler-sdk tool library '" << itr
<< "' did not contain rocprofiler_configure symbol (search method: "
"ELF parsing)";
ROCP_CI_LOG(WARNING) << fmt::format(
"[ROCP_TOOL_LIBRARIES] rocprofiler-sdk tool library '{}' did not "
"contain rocprofiler_configure symbol (search method: ELF parsing). "
"Attempting dlopen anyway since the library was explicitly listed in "
"ROCP_TOOL_LIBRARIES",
itr);
}
}
@@ -295,10 +342,10 @@ find_clients()
{
auto _sym = rocprofiler_configure_dlsym(handle);
// FATAL bc they explicitly said this was a tool library
ROCP_FATAL_IF(!_sym)
ROCP_CI_LOG_IF(WARNING, !_sym)
<< "[ROCP_TOOL_LIBRARIES] rocprofiler-sdk tool library '" << itr
<< "' did not contain rocprofiler_configure symbol (search method: dlsym)";
if(is_unique_configure_func(_sym)) emplace_client(itr, handle, _sym);
if(_sym && is_unique_configure_func(_sym)) emplace_client(itr, handle, _sym);
}
}
}
@@ -323,13 +370,22 @@ find_clients()
{
ROCP_INFO << "searching " << itr << " for rocprofiler_configure";
if(fs::exists(itr))
if(fs::exists(itr) && resolved_exists(itr))
{
auto elfinfo = common::elf_utils::read(itr);
if(!elfinfo.has_symbol(std::regex{"^rocprofiler_configure$"})) continue;
if(!elfinfo.has_symbol(std::regex{"^rocprofiler_configure$"}))
{
ROCP_INFO << fmt::format(
"Shared library '{}' did not contain the 'rocprofiler_configure' symbol "
"(search method: ELF parsing) required by rocprofiler-sdk for tools",
itr);
continue;
}
}
else
{
ROCP_INFO << fmt::format(
"Shared library '{}' either does not exist or is a broken symbolic link", itr);
continue;
}