[rocprof-sys] Refactor trace_cache architecture with improved type erasure and processing pipeline (#1710)

- Redesigned buffer_storage with a flush_worker pattern for better thread management and resource cleanup
- Introduced type-safe abstractions through new components: cacheable.hpp, cache_type_traits.hpp, sample_processor.hpp, and type_registry.hpp
- Optimized type erasure implementation in sample processor to reduce runtime overhead
- Renamed rocpd_post_processing to rocpd_processor and restructured the processing pipeline
- Removed storage_parser.cpp and integrated functionality into header-based template implementation
- Enhanced cache_manager with improved processing workflow and better separation of concerns
This commit is contained in:
Milan Radosavljevic
2025-11-20 20:18:13 +01:00
committad av GitHub
förälder 2f6fb89c51
incheckning 4d670099fa
27 ändrade filer med 2876 tillägg och 1854 borttagningar
@@ -22,23 +22,27 @@
set(trace_cache_sources
${CMAKE_CURRENT_LIST_DIR}/cache_manager.cpp
${CMAKE_CURRENT_LIST_DIR}/storage_parser.cpp
${CMAKE_CURRENT_LIST_DIR}/buffer_storage.cpp
${CMAKE_CURRENT_LIST_DIR}/metadata_registry.cpp
${CMAKE_CURRENT_LIST_DIR}/rocpd_post_processing.cpp
${CMAKE_CURRENT_LIST_DIR}/rocpd_processor.cpp
)
set(trace_cache_headers
${CMAKE_CURRENT_LIST_DIR}/cache_manager.hpp
${CMAKE_CURRENT_LIST_DIR}/storage_parser.hpp
${CMAKE_CURRENT_LIST_DIR}/buffer_storage.hpp
${CMAKE_CURRENT_LIST_DIR}/cache_utility.hpp
${CMAKE_CURRENT_LIST_DIR}/cacheable.hpp
${CMAKE_CURRENT_LIST_DIR}/type_registry.hpp
${CMAKE_CURRENT_LIST_DIR}/cache_type_traits.hpp
${CMAKE_CURRENT_LIST_DIR}/metadata_registry.hpp
${CMAKE_CURRENT_LIST_DIR}/rocpd_post_processing.hpp
${CMAKE_CURRENT_LIST_DIR}/rocpd_processor.hpp
${CMAKE_CURRENT_LIST_DIR}/sample_processor.hpp
${CMAKE_CURRENT_LIST_DIR}/sample_type.hpp
)
target_sources(
target_sources(rocprofiler-systems-core-library PRIVATE ${trace_cache_sources})
target_include_directories(
rocprofiler-systems-core-library
PRIVATE ${trace_cache_sources} ${trace_cache_headers}
PUBLIC ${CMAKE_CURRENT_LIST_DIR}
)
@@ -40,162 +40,89 @@ namespace rocprofsys
namespace trace_cache
{
namespace
{
constexpr auto CACHE_FILE_FLUSH_TIMEOUT = 10ms;
constexpr auto NUM_OF_THREADS = 1;
} // namespace
flush_worker_t::flush_worker_t(worker_function_t worker_function,
worker_synchronization_ptr_t worker_synchronization_ptr,
std::string filepath)
buffer_storage::buffer_storage()
{
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
m_thread_pool = std::make_unique<PTL::ThreadPool>(NUM_OF_THREADS);
m_thread_pool->initialize_threadpool(NUM_OF_THREADS);
m_task_group = std::make_unique<PTL::TaskGroup<void>>(m_thread_pool.get());
}
: m_worker_function(std::move(worker_function))
, m_worker_synchronization(std::move(worker_synchronization_ptr))
, m_filepath(std::move(filepath))
{}
void
buffer_storage::start_flushing_thread(pid_t _pid)
flush_worker_t::start(const pid_t& current_pid)
{
if(m_worker_synchronization->is_running)
{
std::stringstream _ss;
_ss << "Flush worker is already running";
throw std::runtime_error(_ss.str());
}
m_ofs = std::ofstream{ m_filepath, std::ios::binary | std::ios::out };
if(!m_ofs.good())
{
std::stringstream _ss;
_ss << "Error opening file for writing: " << m_filepath;
throw std::runtime_error(_ss.str());
}
m_worker_synchronization->origin_pid = current_pid;
m_worker_synchronization->is_running = true;
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
m_task_group->exec([this, _pid]() {
auto filepath = get_buffered_storage_filename(get_root_process_id(), getpid());
std::ofstream _ofs(filepath, std::ios::binary | std::ios::out);
if(!_ofs)
{
std::stringstream _ss;
_ss << "Error opening file for writing: " << filepath;
throw std::runtime_error(_ss.str());
}
auto execute_flush = [&](std::ofstream& ofs, bool force = false) {
size_t _head, _tail;
{
std::lock_guard guard{ m_mutex };
_head = m_head;
_tail = m_tail;
if(_head == _tail)
{
return;
}
auto used_space =
m_head > m_tail ? (m_head - m_tail) : (buffer_size - m_tail + m_head);
if(!force && used_space < flush_threshold)
{
return;
}
m_tail = m_head;
}
if(_head > _tail)
{
ofs.write(reinterpret_cast<const char*>(m_buffer->data() + _tail),
_head - _tail);
}
else
{
ofs.write(reinterpret_cast<const char*>(m_buffer->data() + _tail),
buffer_size - _tail);
ofs.write(reinterpret_cast<const char*>(m_buffer->data()), _head);
}
};
m_created_process = _pid;
m_flushing_thread = std::make_unique<std::thread>([&]() {
std::mutex _shutdown_condition_mutex;
while(m_running)
while(m_worker_synchronization->is_running)
{
execute_flush(_ofs);
m_worker_function(m_ofs, false);
std::unique_lock _lock{ _shutdown_condition_mutex };
m_shutdown_condition.wait_for(
_lock, std::chrono::milliseconds(CACHE_FILE_FLUSH_TIMEOUT),
[&]() { return !m_running; });
m_worker_synchronization->is_running_condition.wait_for(
_lock, CACHE_FILE_FLUSH_TIMEOUT,
[&]() { return !m_worker_synchronization->is_running; });
}
execute_flush(_ofs, true);
_ofs.close();
m_exit_finished = true;
m_exit_condition.notify_one();
m_worker_function(m_ofs, true);
m_ofs.close();
m_worker_synchronization->exit_finished = true;
m_worker_synchronization->exit_finished_condition.notify_one();
});
}
buffer_storage::~buffer_storage()
{
shutdown();
if(m_thread_pool && m_thread_pool->is_alive())
{
m_thread_pool->destroy_threadpool();
}
}
void
buffer_storage::shutdown()
flush_worker_t::stop(const pid_t& current_pid)
{
if(!m_running)
const bool flushing_thread_exist = m_flushing_thread != nullptr;
const bool worker_is_running =
m_worker_synchronization != nullptr && m_worker_synchronization->is_running;
if(flushing_thread_exist && worker_is_running)
{
return;
}
ROCPROFSYS_DEBUG("Buffer storage shutting down..\n");
m_worker_synchronization->is_running = false;
m_worker_synchronization->is_running_condition.notify_all();
ROCPROFSYS_DEBUG("Buffer storage shutting down..");
m_running = false;
m_shutdown_condition.notify_all();
if(m_created_process != getpid())
{
ROCPROFSYS_DEBUG(
"Buffer storage is not created in same process as shutting down..");
return;
}
std::mutex _exit_mutex;
std::unique_lock _exit_lock{ _exit_mutex };
m_exit_condition.wait(_exit_lock, [&]() { return m_exit_finished; });
if(m_thread_pool && m_thread_pool->is_alive())
{
m_thread_pool->destroy_threadpool();
}
}
void
buffer_storage::fragment_memory()
{
auto* _data = m_buffer->data();
memset(_data + m_head, 0xFFFF, buffer_size - m_head);
*reinterpret_cast<entry_type*>(_data + m_head) = entry_type::fragmented_space;
size_t remaining_bytes = buffer_size - m_head - minimal_fragmented_memory_size;
*reinterpret_cast<size_t*>(_data + m_head + sizeof(entry_type)) = remaining_bytes;
m_head = 0;
}
uint8_t*
buffer_storage::reserve_memory_space(size_t len)
{
size_t _size;
{
std::lock_guard scope{ m_mutex };
if((m_head + len + minimal_fragmented_memory_size) > buffer_size)
const bool thread_is_created_in_this_process =
current_pid == m_worker_synchronization->origin_pid;
if(!thread_is_created_in_this_process)
{
fragment_memory();
ROCPROFSYS_DEBUG(
"Buffer storage is not created in same process as shutting down..\n");
return;
}
std::mutex _exit_mutex;
std::unique_lock _exit_lock{ _exit_mutex };
m_worker_synchronization->exit_finished_condition.wait(
_exit_lock, [&]() { return m_worker_synchronization->exit_finished.load(); });
if(m_flushing_thread->joinable())
{
m_flushing_thread->join();
m_flushing_thread.reset();
}
_size = m_head;
m_head = m_head + len;
}
auto* _result = m_buffer->data() + _size;
memset(_result, 0, len);
return _result;
}
bool
buffer_storage::is_running() const
{
return m_running;
}
} // namespace trace_cache
@@ -22,21 +22,21 @@
#pragma once
#include "PTL/TaskGroup.hh"
#include "PTL/ThreadPool.hh"
#include "cache_utility.hpp"
#include "sample_type.hpp"
#include <PTL/PTL.hh>
#include "core/trace_cache/cacheable.hpp"
#include "common/defines.h"
#include "core/debug.hpp"
#include <cassert>
#include <condition_variable>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <mutex>
#include <stdexcept>
#include <stdint.h>
#include <string.h>
#include <type_traits>
#include <unistd.h>
namespace rocprofsys
@@ -44,134 +44,226 @@ namespace rocprofsys
namespace trace_cache
{
class cache_manager;
using ofs_t = std::basic_ostream<char>;
using worker_function_t = std::function<void(ofs_t& ofs, bool force)>;
struct worker_synchronization_t
{
std::condition_variable is_running_condition;
std::atomic_bool is_running{ false };
std::condition_variable exit_finished_condition;
std::atomic_bool exit_finished{ false };
pid_t origin_pid;
};
using worker_synchronization_ptr_t = std::shared_ptr<worker_synchronization_t>;
struct flush_worker_t
{
explicit flush_worker_t(worker_function_t worker_function,
worker_synchronization_ptr_t worker_synchronization_ptr,
std::string filepath);
void start(const pid_t& current_pid);
void stop(const pid_t& current_pid);
private:
worker_function_t m_worker_function;
worker_synchronization_ptr_t m_worker_synchronization;
std::string m_filepath;
std::ofstream m_ofs;
std::unique_ptr<std::thread> m_flushing_thread;
};
struct flush_worker_factory_t
{
using worker_t = flush_worker_t;
flush_worker_factory_t() = delete;
flush_worker_factory_t(flush_worker_factory_t&) = delete;
flush_worker_factory_t& operator=(flush_worker_factory_t&) = delete;
flush_worker_factory_t(flush_worker_factory_t&&) = delete;
flush_worker_factory_t& operator=(flush_worker_factory_t&&) = delete;
static std::shared_ptr<worker_t> get_worker(
worker_function_t worker_function,
const worker_synchronization_ptr_t& worker_synchronization_ptr,
std::string filepath)
{
return std::make_shared<worker_t>(worker_function, worker_synchronization_ptr,
std::move(filepath));
}
};
template <typename WorkerFactory, typename TypeIdentifierEnum>
class buffer_storage
{
public:
static buffer_storage& get_instance();
static_assert(type_traits::is_enum_class_v<TypeIdentifierEnum>,
"TypeIdentifierEnum must be an enum class");
template <typename... T>
void store(entry_type type, T&&... values)
public:
explicit buffer_storage(std::string filepath)
: m_worker{ std::move(
WorkerFactory::get_worker([this](ofs_t& ofs, bool force) { flush(ofs, force); },
m_worker_synchronization, std::move(filepath))) }
{}
~buffer_storage() { shutdown(); }
void start(const pid_t& current_pid = getpid())
{
if(!is_running())
if(m_worker == nullptr)
{
throw std::runtime_error(
"Trying to use buffered storage while it is not running");
"Worker is null - unable to start buffered storage.");
}
if(is_running())
{
return;
}
constexpr bool is_supported_type = (supported_types::is_supported<T> && ...);
static_assert(is_supported_type,
"Supported types are const char*, char*, "
"unsigned long, unsigned int, long, unsigned "
"char, std::vector<unsigned char>, double, and int.");
auto arg_size = get_size(values...);
auto total_size = arg_size + sizeof(type) + sizeof(size_t);
auto* reserved_memory = reserve_memory_space(total_size);
size_t position = 0;
auto store_value = [&](const auto& val) {
using Type = decltype(val);
size_t len = 0;
auto* dest = reserved_memory + position;
if constexpr(std::is_same_v<std::decay_t<Type>, const char*>)
{
len = strlen(val) + 1;
std::memcpy(dest, val, len);
}
else if constexpr(std::is_same_v<std::decay_t<Type>, std::vector<uint8_t>>)
{
size_t elem_count = val.size();
len = elem_count + sizeof(size_t);
std::memcpy(dest, &elem_count, sizeof(size_t));
std::memcpy(dest + sizeof(size_t), val.data(), val.size());
}
else
{
using ClearType = std::decay_t<decltype(val)>;
len = sizeof(ClearType);
*reinterpret_cast<ClearType*>(dest) = val;
}
position += len;
};
store_value(type);
store_value(arg_size);
(store_value(values), ...);
m_worker->start(current_pid);
}
void start_flushing_thread(pid_t pid);
~buffer_storage();
void shutdown(const pid_t& current_pid = getpid())
{
if(m_worker == nullptr)
{
throw std::runtime_error(
"Worker is null - unable to shutdown buffered storage.");
return;
}
if(!is_running())
{
return;
}
m_worker->stop(current_pid);
}
template <typename Type>
auto store(const Type& value)
{
if(m_worker == nullptr || !is_running())
{
throw std::runtime_error(
"Trying to use buffered storage while it is not running");
}
type_traits::check_type<Type, TypeIdentifierEnum>();
using TypeIdentifierEnumUderlayingType =
std::underlying_type_t<TypeIdentifierEnum>;
size_t sample_size = get_size(value);
size_t bytes_to_reserve = header_size<TypeIdentifierEnum> + sample_size;
auto* buf = reserve_memory_space(bytes_to_reserve);
size_t position = 0;
auto type_identifier_value =
static_cast<TypeIdentifierEnumUderlayingType>(Type::type_identifier);
utility::store_value(type_identifier_value, buf, position);
utility::store_value(sample_size, buf, position);
serialize(buf + position, value);
}
ROCPROFSYS_INLINE bool is_running() const
{
return m_worker_synchronization != nullptr &&
m_worker_synchronization->is_running;
}
private:
friend class cache_manager;
buffer_storage();
void shutdown();
bool is_running() const;
void fragment_memory();
uint8_t* reserve_memory_space(size_t len);
template <typename... Types>
struct typelist
void flush(ofs_t& ofs, bool force)
{
template <typename T>
constexpr static bool is_supported =
(std::is_same_v<std::decay_t<T>, Types> || ...);
};
using supported_types = typelist<const char*, char*, uint64_t, int32_t, uint32_t,
std::vector<uint8_t>, uint8_t, int64_t, double>;
template <typename T>
static constexpr bool is_string_literal_v =
std::is_same_v<std::decay_t<T>, const char*> ||
std::is_same_v<std::decay_t<T>, char*>;
template <typename T>
constexpr size_t get_size_impl(T&& val)
{
if constexpr(is_string_literal_v<T>)
size_t _head, _tail;
{
size_t size = 0;
while(val[size] != '\0')
std::lock_guard guard{ m_mutex };
_head = m_head;
_tail = m_tail;
if(_head == _tail)
{
size++;
return;
}
return ++size;
auto used_space =
m_head > m_tail ? (m_head - m_tail) : (buffer_size - m_tail + m_head);
if(!force && used_space < flush_threshold)
{
return;
}
m_tail = m_head;
}
else if constexpr(std::is_same_v<std::decay_t<T>, std::vector<uint8_t>>)
if(_head > _tail)
{
return val.size() + sizeof(size_t);
ofs.write(reinterpret_cast<const char*>(m_buffer->data() + _tail),
_head - _tail);
}
else
{
return sizeof(T);
ofs.write(reinterpret_cast<const char*>(m_buffer->data() + _tail),
buffer_size - _tail);
ofs.write(reinterpret_cast<const char*>(m_buffer->data()), _head);
}
if(ofs.fail())
{
ROCPROFSYS_WARNING(1, "Error flushing buffered storage to file for pid: %d",
m_worker_synchronization->origin_pid);
ROCPROFSYS_CI_THROW(true,
"Error flushing buffered storage to file for pid: %d",
m_worker_synchronization->origin_pid);
}
}
template <typename... T>
constexpr size_t get_size(T&&... val)
void fragment_memory()
{
auto total_size = 0;
((total_size += get_size_impl(val)), ...);
return total_size;
auto* _data = m_buffer->data();
memset(_data + m_head, std::numeric_limits<uint8_t>::max(), buffer_size - m_head);
*reinterpret_cast<TypeIdentifierEnum*>(_data + m_head) =
TypeIdentifierEnum::fragmented_space;
size_t remaining_bytes = buffer_size - m_head - header_size<TypeIdentifierEnum>;
*reinterpret_cast<size_t*>(_data + m_head + sizeof(TypeIdentifierEnum)) =
remaining_bytes;
m_head = 0;
}
ROCPROFSYS_INLINE uint8_t* reserve_memory_space(const size_t& number_of_bytes)
{
size_t _size;
{
std::lock_guard scope{ m_mutex };
if(__builtin_expect((m_head + number_of_bytes +
header_size<TypeIdentifierEnum>) > buffer_size,
0))
{
fragment_memory();
}
_size = m_head;
m_head += number_of_bytes;
}
return m_buffer->data() + _size;
}
private:
std::mutex m_mutex;
std::condition_variable m_exit_condition;
bool m_exit_finished{ false };
bool m_running{ true };
std::condition_variable m_shutdown_condition;
worker_synchronization_ptr_t m_worker_synchronization{
std::make_shared<worker_synchronization_t>()
};
std::unique_ptr<PTL::ThreadPool> m_thread_pool;
std::unique_ptr<PTL::TaskGroup<void>> m_task_group;
size_t m_head{ 0 };
size_t m_tail{ 0 };
std::unique_ptr<buffer_array_t> m_buffer{ std::make_unique<buffer_array_t>() };
pid_t m_created_process;
std::shared_ptr<typename WorkerFactory::worker_t> m_worker;
std::mutex m_mutex;
size_t m_head{ 0 };
size_t m_tail{ 0 };
std::unique_ptr<buffer_array_t> m_buffer{ std::make_unique<buffer_array_t>() };
};
} // namespace trace_cache
@@ -21,14 +21,17 @@
// SOFTWARE.
#include "cache_manager.hpp"
#include "agent_manager.hpp"
#include "core/trace_cache/metadata_registry.hpp"
#include "core/trace_cache/rocpd_processor.hpp"
#include "core/trace_cache/sample_processor.hpp"
#include "core/agent_manager.hpp"
#include "core/config.hpp"
#include "core/trace_cache/storage_parser.hpp"
#include "debug.hpp"
#include "core/debug.hpp"
#include "library/runtime.hpp"
#include "trace_cache/cache_utility.hpp"
#include "trace_cache/metadata_registry.hpp"
#include "trace_cache/rocpd_post_processing.hpp"
#include <algorithm>
#include <memory>
#include <vector>
@@ -37,8 +40,81 @@ namespace rocprofsys
{
namespace trace_cache
{
namespace
namespace data
{
struct cache_files_t
{
std::string buff_storage;
std::string metadata;
inline bool empty() const { return buff_storage.empty() || metadata.empty(); }
};
struct enabled_formats_t
{
bool rocpd = get_use_rocpd();
void print() const
{
constexpr std::pair<const char*, bool enabled_formats_t::*> formats[] = {
{ "rocpd", &enabled_formats_t::rocpd },
};
bool any_enabled = false;
for(const auto& fmt : formats)
any_enabled |= this->*(fmt.second);
if(!any_enabled) return;
bool first = true;
std::stringstream ss;
for(const auto& fmt : formats)
{
if(this->*(fmt.second))
{
if(!first && sizeof(formats) > 1) ss << ", ";
ss << fmt.first;
first = false;
}
}
ROCPROFSYS_PRINT(
"Generating [%s] format(s) with collected data from trace cache. This may "
"take a while..\n",
ss.str().c_str());
}
};
struct processor_config_t
{
processor_config_t(pid_t pid, pid_t ppid,
std::shared_ptr<metadata_registry> metadata_registry_ptr,
std::shared_ptr<agent_manager> agent_manager_ptr)
: _pid(pid)
, _ppid(ppid)
, _metadata_registry(std::move(metadata_registry_ptr))
, _agent_manager(std::move(agent_manager_ptr))
{}
pid_t _pid;
pid_t _ppid;
std::shared_ptr<metadata_registry> _metadata_registry;
std::shared_ptr<agent_manager> _agent_manager;
};
struct processor_storage_t
{
std::shared_ptr<rocpd_processor_t> rocpd_processor{ nullptr };
};
using directory_files_t = std::vector<std::string>;
using mapped_cache_files_t = std::map<pid_t, cache_files_t>;
} // namespace data
namespace filesystem_utils
{
void
remove_if_exists(const std::string& fname)
{
@@ -64,19 +140,29 @@ remove_if_exists(const std::string& fname)
}
}
std::vector<std::string>
list_dir_files(const std::string& path)
data::directory_files_t
list_dir_files(const std::string& _path)
{
DIR* dir = opendir(path.c_str());
if(dir == nullptr)
if(_path.empty())
{
ROCPROFSYS_THROW("Error opening directory: %s", path.c_str());
return {};
}
std::vector<std::string> result{};
dirent* entry;
auto dir_deleter = [](DIR* d) {
if(d) closedir(d);
};
while((entry = readdir(dir)) != nullptr)
std::unique_ptr<DIR, decltype(dir_deleter)> dir(opendir(_path.c_str()), dir_deleter);
if(!dir)
{
ROCPROFSYS_THROW("Error opening directory: %s", _path.c_str());
}
data::directory_files_t result{};
dirent* entry;
while((entry = readdir(dir.get())) != nullptr)
{
if(std::string(entry->d_name) != "." && std::string(entry->d_name) != "..")
{
@@ -84,25 +170,26 @@ list_dir_files(const std::string& path)
}
}
closedir(dir);
return result;
}
struct cache_files
data::mapped_cache_files_t
get_cache_files(const pid_t& root_pid,
const data::directory_files_t& _files_from_temp_directory)
{
std::string buff_storage;
std::string metadata;
};
if(_files_from_temp_directory.empty())
{
return {};
}
std::map<pid_t, cache_files>
get_cache_files()
{
const auto root_pid = get_root_process_id();
const auto tmp_files = list_dir_files("/tmp/");
std::map<int, cache_files> cache_map{};
data::mapped_cache_files_t cache_map{};
auto parse_and_fill_cache = [&](const std::string& filename) {
if(filename.empty())
{
return;
}
const std::regex buff_regex(R"(buffered_storage_(\d+)_(\d+)\.bin)");
const std::regex meta_regex(R"(metadata_(\d+)_(\d+)\.json)");
std::smatch match;
@@ -113,7 +200,7 @@ get_cache_files()
int pid = std::stoi(match[2]);
if(parent_pid == root_pid)
{
cache_map[pid].buff_storage = "/tmp/" + filename;
cache_map[pid].buff_storage = trace_cache::tmp_directory + filename;
}
}
else if(std::regex_match(filename, match, meta_regex))
@@ -122,16 +209,119 @@ get_cache_files()
int pid = std::stoi(match[2]);
if(parent_pid == root_pid)
{
cache_map[pid].metadata = "/tmp/" + filename;
cache_map[pid].metadata = trace_cache::tmp_directory + filename;
}
}
};
std::for_each(tmp_files.begin(), tmp_files.end(), parse_and_fill_cache);
std::for_each(_files_from_temp_directory.begin(), _files_from_temp_directory.end(),
parse_and_fill_cache);
return cache_map;
}
} // namespace
void
clear_cache_files(const data::mapped_cache_files_t& _cache_files)
{
ROCPROFSYS_PRINT("Removing cached temporary files...\n");
for(const auto& [_, files] : _cache_files)
{
ROCPROFSYS_DEBUG("Removing cached temporary file: %s\n",
files.buff_storage.c_str());
filesystem_utils::remove_if_exists(files.buff_storage);
ROCPROFSYS_DEBUG("Removing cached temporary file: %s\n", files.metadata.c_str());
filesystem_utils::remove_if_exists(files.metadata);
}
}
} // namespace filesystem_utils
namespace processing_utils
{
[[nodiscard]] data::processor_storage_t
configure_processors(const std::shared_ptr<sample_processor_t>& _type_processing,
const std::shared_ptr<data::processor_config_t>& _processor_config,
const data::enabled_formats_t& _enabled_formats)
{
data::processor_storage_t processor_storage;
if(_enabled_formats.rocpd)
{
processor_storage.rocpd_processor = std::make_shared<rocpd_processor_t>(
_processor_config->_metadata_registry, _processor_config->_agent_manager,
_processor_config->_pid, _processor_config->_ppid);
_type_processing->add_handler(*processor_storage.rocpd_processor);
}
return processor_storage;
}
void
process_buffered_storage(
const std::shared_ptr<data::processor_config_t>& _processor_config,
const std::string& _storage_filename, const data::enabled_formats_t& _enabled_formats)
{
auto _processor_coordinator = std::make_shared<sample_processor_t>();
auto processor_storage =
configure_processors(_processor_coordinator, _processor_config, _enabled_formats);
storage_parser_t _parser(_storage_filename);
_processor_coordinator->prepare_for_processing();
_parser.load(_processor_coordinator);
_processor_coordinator->finalize_processing();
}
std::vector<std::shared_ptr<data::processor_config_t>>
create_processor_configs(const data::mapped_cache_files_t& _cache_files,
const pid_t& _root_pid)
{
constexpr size_t ROOT_PROCESS_INCREMENT{ 1 };
std::vector<std::shared_ptr<data::processor_config_t>> processor_configs;
processor_configs.reserve(_cache_files.size() + ROOT_PROCESS_INCREMENT);
for(const auto& [pid, files] : _cache_files)
{
if(files.empty())
{
continue;
}
std::vector<std::shared_ptr<agent>> _agents;
auto _metadata = std::make_shared<metadata_registry>();
_metadata->load_from_file(files.metadata, _agents);
auto _agent_manager = std::make_shared<agent_manager>(_agents);
processor_configs.push_back(std::make_shared<data::processor_config_t>(
pid, _root_pid, _metadata, _agent_manager));
}
return processor_configs;
}
void
multithreaded_processing(
const std::vector<std::shared_ptr<data::processor_config_t>>& _processor_configs,
const data::enabled_formats_t& _enabled_formats)
{
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
std::vector<std::thread> processing_threads;
processing_threads.reserve(_processor_configs.size());
for(const auto& processor_config : _processor_configs)
{
processing_threads.emplace_back(
process_buffered_storage, processor_config,
utility::get_buffered_storage_filename(processor_config->_ppid,
processor_config->_pid),
_enabled_formats);
}
for(auto& thread : processing_threads)
{
thread.join();
}
}
} // namespace processing_utils
cache_manager&
cache_manager::get_instance()
@@ -143,90 +333,37 @@ cache_manager::get_instance()
void
cache_manager::post_process_bulk()
{
if(is_root_process())
if(!is_root_process())
{
if(m_storage.is_running())
{
ROCPROFSYS_WARNING(2,
"Postprocessing called without previously shutting down "
"cache storage. Calling shutdown explicitly..\n");
shutdown();
}
auto _cache_files = get_cache_files();
if(get_use_rocpd())
{
ROCPROFSYS_PRINT(
"Generating rocpd with collected data. This may take a while..\n");
std::vector<std::thread> rocpd_threads;
ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
rocpd_threads.emplace_back([this]() {
auto pid = getpid();
auto ppid = get_root_process_id();
rocpd_post_processing _post_processing(
m_metadata, get_agent_manager_instance(), pid, ppid);
storage_parser _parser(
get_buffered_storage_filename(get_root_process_id(), getpid()));
_post_processing.register_parser_callback(_parser);
_post_processing.post_process_metadata();
_parser.consume_storage();
});
for(const auto& [pid, files] : _cache_files)
{
if(!files.buff_storage.empty() && !files.metadata.empty())
{
rocpd_threads.emplace_back([pid = pid, files = files]() {
ROCPROFSYS_DEBUG(
"Creating database for [%d] from buffered storage "
"file: %s and from metadata file: %s\n",
pid, files.buff_storage.c_str(), files.metadata.c_str());
std::vector<std::shared_ptr<agent>> _agents;
metadata_registry _metadata;
auto res = _metadata.load_from_file(files.metadata, _agents);
if(!res)
{
ROCPROFSYS_WARNING(0,
"Load from file for metadata failed: %s\n",
files.metadata.c_str());
return;
}
agent_manager _agent_manager{ _agents };
auto ppid = get_root_process_id();
rocpd_post_processing _post_processing(_metadata, _agent_manager,
pid, ppid);
storage_parser _parser(files.buff_storage);
_post_processing.register_parser_callback(_parser);
_post_processing.post_process_metadata();
_parser.consume_storage();
});
}
}
for(auto& thread : rocpd_threads)
{
thread.join();
}
}
ROCPROFSYS_PRINT("Removing cached temporary files...\n");
for(const auto& [pid, files] : _cache_files)
{
ROCPROFSYS_PRINT("Removing cached temporary file: %s\n",
files.buff_storage.c_str());
ROCPROFSYS_PRINT("Removing cached temporary file: %s\n",
files.metadata.c_str());
remove_if_exists(files.buff_storage.c_str());
remove_if_exists(files.metadata.c_str());
}
return;
}
if(m_storage.is_running())
{
ROCPROFSYS_WARNING(2, "Postprocessing called without previously shutting down "
"cache storage. Calling shutdown explicitly..\n");
shutdown();
}
const auto root_pid = get_root_process_id();
const auto temp_directory_content =
filesystem_utils::list_dir_files(trace_cache::tmp_directory);
const auto cache_files =
filesystem_utils::get_cache_files(root_pid, temp_directory_content);
const data::enabled_formats_t enabled_formats;
enabled_formats.print();
auto processor_configs =
processing_utils::create_processor_configs(cache_files, root_pid);
processor_configs.push_back(std::make_shared<data::processor_config_t>(
getpid(), root_pid, m_metadata,
std::make_shared<agent_manager>(get_agent_manager_instance().get_agents())));
processing_utils::multithreaded_processing(processor_configs, enabled_formats);
filesystem_utils::clear_cache_files(cache_files);
}
void
@@ -22,30 +22,43 @@
#pragma once
#include "buffer_storage.hpp"
#include "metadata_registry.hpp"
#include "storage_parser.hpp"
#include "core/trace_cache/buffer_storage.hpp"
#include "core/trace_cache/metadata_registry.hpp"
#include "core/trace_cache/sample_type.hpp"
#include "core/trace_cache/storage_parser.hpp"
#include <memory>
namespace rocprofsys
{
namespace trace_cache
{
using storage_parser_t =
storage_parser<type_identifier_t, kernel_dispatch_sample, memory_copy_sample,
memory_allocate_sample, region_sample, in_time_sample,
pmc_event_with_sample, amd_smi_sample, cpu_freq_sample,
backtrace_region_sample>;
using buffer_storage_t = buffer_storage<flush_worker_factory_t, type_identifier_t>;
class cache_manager
{
public:
static cache_manager& get_instance();
buffer_storage& get_buffer_storage() { return m_storage; }
metadata_registry& get_metadata_registry() { return m_metadata; }
buffer_storage_t& get_buffer_storage() { return m_storage; }
metadata_registry& get_metadata_registry() { return *m_metadata; }
void shutdown();
void post_process_bulk();
private:
void post_process_metadata();
cache_manager() = default;
buffer_storage m_storage;
metadata_registry m_metadata;
buffer_storage_t m_storage{ utility::get_buffered_storage_filename(
get_root_process_id(), getpid()) };
std::shared_ptr<metadata_registry> m_metadata{
std::make_shared<metadata_registry>()
};
};
inline metadata_registry&
@@ -54,7 +67,7 @@ get_metadata_registry()
return cache_manager::get_instance().get_metadata_registry();
}
inline buffer_storage&
inline buffer_storage_t&
get_buffer_storage()
{
return cache_manager::get_instance().get_buffer_storage();
@@ -0,0 +1,173 @@
// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <cstdint>
#include <string_view>
#include <tuple>
#include <type_traits>
#include <variant>
#include <vector>
namespace rocprofsys
{
namespace trace_cache
{
namespace type_traits
{
template <typename T>
struct always_false : std::false_type
{};
} // namespace type_traits
template <typename T>
void
serialize(uint8_t*, const T&)
{
static_assert(type_traits::always_false<T>::value, "serialize<T> not specialized");
}
template <typename T>
T
deserialize(uint8_t*&)
{
static_assert(type_traits::always_false<T>::value, "deserialize<T> not specialized");
return T{};
}
template <typename T>
size_t
get_size(const T&)
{
static_assert(type_traits::always_false<T>::value, "get_size(T) not specialized");
return 0;
}
namespace type_traits
{
template <typename T>
struct tuple_to_variant;
template <typename... Types>
struct tuple_to_variant<std::tuple<Types...>>
{
using type = std::variant<Types...>;
};
template <class...>
using void_t = void;
template <typename... Types>
struct typelist
{
template <typename T>
constexpr static bool is_supported = (std::is_same_v<std::decay_t<T>, Types> || ...);
};
using supported_types = typelist<std::string_view, uint64_t, int32_t, uint32_t,
std::vector<uint8_t>, uint8_t, int64_t, double>;
template <typename T>
static constexpr bool is_string_view_v =
std::is_same_v<std::decay_t<T>, std::string_view>;
template <typename T>
struct is_enum_class
: std::bool_constant<std::is_enum_v<T> &&
!std::is_convertible_v<T, std::underlying_type_t<T>>>
{};
template <typename T>
inline constexpr bool is_enum_class_v = is_enum_class<T>::value;
template <typename T, typename TypeIdentifierEnum, typename = void>
struct has_type_identifier : std::false_type
{};
template <class T, typename TypeIdentifierEnum>
struct has_type_identifier<T, TypeIdentifierEnum, void_t<decltype(T::type_identifier)>>
: std::bool_constant<
is_enum_class_v<TypeIdentifierEnum> &&
std::is_convertible_v<decltype(T::type_identifier), TypeIdentifierEnum>>
{};
template <typename T, typename = void>
struct has_serialize : std::false_type
{};
template <typename T>
struct has_serialize<T, std::void_t<decltype(serialize(std::declval<uint8_t*>(),
std::declval<const T&>()))>>
: std::true_type
{};
template <typename T, typename = void>
struct has_deserialize : std::false_type
{};
template <typename T>
struct has_deserialize<
T, void_t<std::is_same<decltype(deserialize<T>(std::declval<uint8_t*&>())), T>>>
: std::true_type
{};
template <typename T, typename = void>
struct has_get_size : std::false_type
{};
template <typename T>
struct has_get_size<T, void_t<decltype(get_size(std::declval<const T&>()))>>
: std::true_type
{};
template <typename T, typename TypeIdentifierEnum>
__attribute__((always_inline)) inline constexpr void
check_type()
{
static_assert(has_serialize<T>::value, "Type doesn't have `serialize` function.");
static_assert(has_deserialize<T>::value, "Type doesn't have `deserialize` function.");
static_assert(has_get_size<T>::value, "Type doesn't have `get_size` function.");
static_assert(has_type_identifier<T, TypeIdentifierEnum>::value,
"Type doesn't have `type_identifier` member with correct type.");
}
template <typename T, typename TypeIdentifierEnum, typename CacheableType,
typename = void>
struct has_execute_processing : std::false_type
{};
template <typename T, typename TypeIdentifierEnum, typename CacheableType>
struct has_execute_processing<
T, TypeIdentifierEnum, CacheableType,
void_t<decltype(std::declval<T>().execute_sample_processing(
std::declval<TypeIdentifierEnum>(), std::declval<const CacheableType&>()))>>
: std::bool_constant<std::is_void_v<decltype(std::declval<T>().execute_sample_processing(
std::declval<TypeIdentifierEnum>(), std::declval<const CacheableType&>()))>>
{};
} // namespace type_traits
} // namespace trace_cache
} // namespace rocprofsys
@@ -0,0 +1,176 @@
// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/trace_cache/cache_type_traits.hpp"
#include "library/runtime.hpp"
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <stdint.h>
#include <string>
#include <type_traits>
#include <vector>
using namespace std::chrono_literals;
namespace rocprofsys
{
namespace trace_cache
{
struct cacheable_t
{
cacheable_t() = default;
};
constexpr size_t MByte = 1024 * 1024;
constexpr size_t buffer_size = 100 * MByte;
constexpr size_t flush_threshold = 80 * MByte;
constexpr auto CACHE_FILE_FLUSH_TIMEOUT = 10ms;
constexpr auto ABSOLUTE = "ABS";
constexpr auto PERCENTAGE = "%";
template <typename TypeIdentifierEnum>
constexpr size_t header_size = sizeof(TypeIdentifierEnum) + sizeof(size_t);
using buffer_array_t = std::array<uint8_t, buffer_size>;
const auto tmp_directory = std::string{ "/tmp/" };
namespace utility
{
const auto get_buffered_storage_filename = [](const int& ppid, const int& pid) {
return std::string{ tmp_directory + "buffered_storage_" + std::to_string(ppid) + "_" +
std::to_string(pid) + ".bin" };
};
const auto get_metadata_filepath = [](const int& ppid, const int& pid) {
return std::string{ tmp_directory + "metadata_" + std::to_string(ppid) + "_" +
std::to_string(pid) + ".json" };
};
template <typename Type>
__attribute__((always_inline)) inline constexpr size_t
get_size(Type&& val)
{
using DecayedType = std::decay_t<Type>;
static_assert(type_traits::supported_types::is_supported<DecayedType>,
"Unsupported type in get_size");
if constexpr(type_traits::is_string_view_v<DecayedType> ||
std::is_same_v<DecayedType, std::vector<uint8_t>>)
{
return val.size() + sizeof(size_t);
}
else
{
return sizeof(DecayedType);
}
}
template <typename Type, typename... Types>
__attribute__((always_inline)) inline constexpr size_t
get_size(Type&& val, Types&&... vals)
{
return get_size(std::forward<Type>(val)) + get_size(std::forward<Types>(vals)...);
}
template <typename Type>
__attribute__((always_inline)) inline void
store_value(const Type& value, uint8_t* buffer, size_t& position)
{
using DecayedType = std::decay_t<Type>;
static_assert(type_traits::supported_types::is_supported<DecayedType>,
"Unsupported type in store_value");
auto* dest = buffer + position;
if constexpr(type_traits::is_string_view_v<DecayedType> ||
std::is_same_v<DecayedType, std::vector<uint8_t>>)
{
const size_t elem_count = value.size();
*reinterpret_cast<size_t*>(dest) = elem_count;
std::memcpy(dest + sizeof(size_t), value.data(), elem_count);
position += elem_count + sizeof(size_t);
}
else
{
*reinterpret_cast<DecayedType*>(dest) = value;
position += sizeof(DecayedType);
}
}
template <typename... Types>
__attribute__((always_inline)) inline void
store_value(uint8_t* buffer, const Types&... values)
{
size_t position = 0;
(store_value(values, buffer, position), ...);
}
template <typename Type>
__attribute__((always_inline)) inline static void
parse_value(uint8_t*& data_pos, Type& arg)
{
using DecayedType = std::decay_t<Type>;
static_assert(type_traits::supported_types::is_supported<DecayedType>,
"Unsupported type in parse_value");
if constexpr(type_traits::is_string_view_v<DecayedType>)
{
const size_t string_size = *reinterpret_cast<const size_t*>(data_pos);
data_pos += sizeof(size_t);
arg = std::string_view{ reinterpret_cast<const char*>(data_pos), string_size };
data_pos += string_size;
}
else if constexpr(std::is_same_v<DecayedType, std::vector<uint8_t>>)
{
const size_t vector_size = *reinterpret_cast<const size_t*>(data_pos);
data_pos += sizeof(size_t);
arg.reserve(vector_size);
std::copy_n(data_pos, vector_size, std::back_inserter(arg));
data_pos += vector_size;
}
else
{
arg = *reinterpret_cast<const DecayedType*>(data_pos);
data_pos += sizeof(DecayedType);
}
}
template <typename Type, typename... Types>
__attribute__((always_inline)) inline static void
parse_value(uint8_t*& data_pos, Type& arg, Types&... args)
{
parse_value(data_pos, arg);
parse_value(data_pos, args...);
}
} // namespace utility
} // namespace trace_cache
} // namespace rocprofsys
@@ -38,6 +38,23 @@ namespace trace_cache
namespace
{
class thread_local_string_pool
{
public:
const char* store(std::string_view str)
{
auto [it, inserted] = m_strings.emplace(str);
return it->c_str();
}
void clear() { m_strings.clear(); }
private:
std::set<std::string> m_strings;
};
thread_local thread_local_string_pool g_string_pool;
template <typename ReturnType, typename DataType, typename Filter>
std::optional<ReturnType>
get_type_info(const DataType& data, const Filter& filter)
@@ -215,12 +232,11 @@ from_json_code_object(const nlohmann::json& _json)
rocprofiler_callback_tracing_code_object_load_data_t co = {};
co.code_object_id = _json["code_object_id"].get<long long>();
auto uri_str = _json["uri"].get<std::string>();
co.uri = new char[uri_str.size() + 1];
strncpy(const_cast<char*>(co.uri), uri_str.c_str(), uri_str.size() + 1);
co.load_base = _json["load_base"].get<long long>();
co.load_size = _json["load_size"].get<long long>();
co.load_delta = _json["load_delta"].get<long long>();
co.storage_type = static_cast<rocprofiler_code_object_storage_type_t>(
co.uri = g_string_pool.store(uri_str);
co.load_base = _json["load_base"].get<long long>();
co.load_size = _json["load_size"].get<long long>();
co.load_delta = _json["load_delta"].get<long long>();
co.storage_type = static_cast<rocprofiler_code_object_storage_type_t>(
_json["storage_type"].get<int>());
auto handle = _json["agent_id_handle"].get<long long>();
# if(ROCPROFILER_VERSION >= 600)
@@ -255,12 +271,10 @@ rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t
from_json_kernel_symbol(const nlohmann::json& _json)
{
rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t ks = {};
ks.kernel_id = _json["kernel_id"].get<long long>();
ks.code_object_id = _json["code_object_id"].get<long long>();
auto kernel_name_str = _json["kernel_name"].get<std::string>();
ks.kernel_name = new char[kernel_name_str.size() + 1];
strncpy(const_cast<char*>(ks.kernel_name), kernel_name_str.c_str(),
sizeof(ks.kernel_name) + 1);
ks.kernel_id = _json["kernel_id"].get<long long>();
ks.code_object_id = _json["code_object_id"].get<long long>();
auto kernel_name_str = _json["kernel_name"].get<std::string>();
ks.kernel_name = g_string_pool.store(kernel_name_str);
ks.kernel_object = _json["kernel_object"].get<long long>();
ks.kernarg_segment_size = _json["kernarg_segment_size"].get<int>();
ks.kernarg_segment_alignment = _json["kernarg_segment_alignment"].get<int>();
@@ -843,6 +857,7 @@ metadata_registry::save_to_file(const std::string& filep
std::ofstream file(filepath);
if(!file.is_open())
{
ROCPROFSYS_WARNING(1, "Error opening file for writing: %s", filepath.c_str());
return false;
}
@@ -851,6 +866,7 @@ metadata_registry::save_to_file(const std::string& filep
return true;
} catch(const std::exception& e)
{
ROCPROFSYS_WARNING(1, "Error saving metadata to file: %s", e.what());
return false;
}
}
@@ -864,6 +880,7 @@ metadata_registry::load_from_file(const std::string& filepath,
std::ifstream file(filepath);
if(!file.is_open())
{
ROCPROFSYS_WARNING(1, "Error opening file for reading: %s", filepath.c_str());
return false;
}
@@ -875,6 +892,7 @@ metadata_registry::load_from_file(const std::string& filepath,
return true;
} catch(const std::exception& e)
{
ROCPROFSYS_WARNING(1, "Error loading metadata from file: %s", e.what());
return false;
}
}
@@ -178,9 +178,14 @@ struct kernel_symbol_less
} // namespace info
class cache_manager;
struct metadata_registry
{
metadata_registry();
metadata_registry(const metadata_registry&) = delete;
metadata_registry& operator=(const metadata_registry&) = delete;
metadata_registry(metadata_registry&&) = delete;
metadata_registry& operator=(metadata_registry&&) = delete;
void set_process(const info::process& process);
void add_pmc_info(const info::pmc& pmc_info);
void add_thread_info(const info::thread& thread_info);
@@ -224,26 +229,24 @@ struct metadata_registry
#endif
private:
friend class cache_manager;
metadata_registry();
common::synchronized<info::process> m_process;
common::synchronized<info::process> m_process{};
common::synchronized<
std::unordered_set<info::pmc, info::pmc_info_hash, info::pmc_info_equal>>
m_pmc_infos;
common::synchronized<std::set<info::thread>> m_threads;
common::synchronized<std::set<info::track>> m_tracks;
m_pmc_infos{};
common::synchronized<std::set<info::thread>> m_threads{};
common::synchronized<std::set<info::track>> m_tracks{};
common::synchronized<std::set<uint64_t>> m_streams;
common::synchronized<std::set<uint64_t>> m_queues;
common::synchronized<std::unordered_set<std::string_view>> m_strings;
common::synchronized<std::set<uint64_t>> m_streams{};
common::synchronized<std::set<uint64_t>> m_queues{};
common::synchronized<std::unordered_set<std::string_view>> m_strings{};
#if ROCPROFSYS_USE_ROCM > 0
common::synchronized<std::set<rocprofiler_callback_tracing_code_object_load_data_t,
info::code_object_less>>
m_code_objects;
m_code_objects{};
common::synchronized<
std::set<rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t,
info::kernel_symbol_less>>
m_kernel_symbols;
m_kernel_symbols{};
rocprofiler::sdk::buffer_name_info_t<const char*> m_buffered_tracing_info{
rocprofiler::sdk::get_buffer_tracing_names<const char*>()
};
@@ -1,904 +0,0 @@
// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "trace_cache/rocpd_post_processing.hpp"
#include "agent_manager.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "gpu_metrics.hpp"
#include "library/thread_info.hpp"
#include "node_info.hpp"
#include "rocpd/data_processor.hpp"
#include "rocpd/data_storage/database.hpp"
#include "trace_cache/metadata_registry.hpp"
#include "trace_cache/sample_type.hpp"
#include "trace_cache/storage_parser.hpp"
#include <cstdint>
#include <limits>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <timemory/utility/demangle.hpp>
#if ROCPROFSYS_USE_ROCM > 0
# include "library/rocprofiler-sdk/fwd.hpp"
# include <rocprofiler-sdk/context.h>
# include <rocprofiler-sdk/version.h>
#endif
namespace rocprofsys
{
namespace trace_cache
{
namespace
{
#if ROCPROFSYS_USE_ROCM > 0
auto
get_handle_from_code_object(
const rocprofiler_callback_tracing_code_object_load_data_t& code_object)
{
# if(ROCPROFILER_VERSION >= 600)
return code_object.agent_id.handle;
# else
return code_object.rocp_agent.handle;
# endif
}
#endif
} // namespace
std::shared_ptr<rocpd::data_processor>
rocpd_post_processing::get_data_processor() const
{
return m_data_processor;
}
postprocessing_callback
rocpd_post_processing::get_kernel_dispatch_callback() const
{
return [&]([[maybe_unused]] const storage_parsed_type_base& parsed) {
#if ROCPROFSYS_USE_ROCM > 0
auto _kds = static_cast<const struct kernel_dispatch_sample&>(parsed);
auto data_processor = get_data_processor();
auto& n_info = node_info::get_instance();
auto process = m_metadata.get_process_info();
auto agent_primary_key =
m_agent_manager.get_agent_by_handle(_kds.agent_id_handle).base_id;
auto thread_primary_key =
data_processor->map_thread_id_to_primary_key(_kds.thread_id);
auto category_id = data_processor->insert_string(
trait::name<category::rocm_kernel_dispatch>::value);
auto kernel_symbol = m_metadata.get_kernel_symbol(_kds.kernel_id);
if(!kernel_symbol.has_value())
{
throw std::runtime_error("Kernel symbol is missing for kernel dispatch");
return;
}
auto region_name_primary_key = data_processor->insert_string(
tim::demangle(kernel_symbol->kernel_name).c_str());
auto stack_id = _kds.correlation_id_internal;
auto parent_stack_id = _kds.correlation_id_ancestor;
auto correlation_id = 0;
auto event_id = data_processor->insert_event(category_id, stack_id,
parent_stack_id, correlation_id);
data_processor->insert_kernel_dispatch(
n_info.id, process.pid, thread_primary_key, agent_primary_key, _kds.kernel_id,
_kds.dispatch_id, _kds.queue_id_handle, _kds.stream_handle,
_kds.start_timestamp, _kds.end_timestamp, _kds.private_segment_size,
_kds.group_segment_size, _kds.workgroup_size_x, _kds.workgroup_size_y,
_kds.workgroup_size_z, _kds.grid_size_x, _kds.grid_size_y, _kds.grid_size_z,
region_name_primary_key, event_id);
#endif
};
}
postprocessing_callback
rocpd_post_processing::get_memory_copy_callback() const
{
return [&]([[maybe_unused]] const storage_parsed_type_base& parsed) {
#if ROCPROFSYS_USE_ROCM > 0
auto _mcs = static_cast<const struct memory_copy_sample&>(parsed);
auto data_processor = get_data_processor();
auto& n_info = node_info::get_instance();
auto process = m_metadata.get_process_info();
auto _name = std::string{ m_metadata.get_buffer_name_info().at(
static_cast<rocprofiler_buffer_tracing_kind_t>(_mcs.kind),
static_cast<rocprofiler_tracing_operation_t>(_mcs.operation)) };
auto name_primary_key = data_processor->insert_string(_name.c_str());
auto category_primary_key =
data_processor->insert_string(trait::name<category::rocm_memory_copy>::value);
auto thread_primary_key =
data_processor->map_thread_id_to_primary_key(_mcs.thread_id);
auto dst_agent_primary_key =
m_agent_manager.get_agent_by_handle(_mcs.dst_agent_id_handle).base_id;
auto src_agent_primary_key =
m_agent_manager.get_agent_by_handle(_mcs.src_agent_id_handle).base_id;
auto stack_id = _mcs.correlation_id_internal;
auto parent_stack_id = _mcs.correlation_id_ancestor;
auto correlation_id = 0;
auto queue_id = 0;
auto event_primary_key = data_processor->insert_event(
category_primary_key, stack_id, parent_stack_id, correlation_id);
data_processor->insert_memory_copy(
n_info.id, process.pid, thread_primary_key, _mcs.start_timestamp,
_mcs.end_timestamp, name_primary_key, dst_agent_primary_key,
_mcs.dst_address_value, src_agent_primary_key, _mcs.src_address_value,
_mcs.bytes, queue_id, _mcs.stream_handle, name_primary_key,
event_primary_key);
#endif
};
}
#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFILER_VERSION >= 600)
postprocessing_callback
rocpd_post_processing::get_memory_allocate_callback() const
{
# if ROCPROFSYS_USE_ROCM > 0
auto memtype_to_db =
[](std::string_view memory_type) -> std::pair<std::string, std::string> {
constexpr auto MEMORY_PREFIX = std::string_view{ "MEMORY_ALLOCATION_" };
constexpr auto SCRATCH_PREFIX = std::string_view{ "SCRATCH_MEMORY_" };
constexpr auto VMEM_PREFIX = std::string_view{ "VMEM_" };
constexpr auto ASYNC_PREFIX = std::string_view{ "ASYNC_" };
std::string _type;
std::string _level;
if(memory_type.find(MEMORY_PREFIX) == 0)
{
_type = memory_type.substr(MEMORY_PREFIX.length());
if(_type.find(VMEM_PREFIX) == 0)
{
_type = _type.substr(VMEM_PREFIX.length());
_level = "VIRTUAL";
}
else
{
_level = "REAL";
}
}
else if(memory_type.find(SCRATCH_PREFIX) == 0)
{
_type = memory_type.substr(SCRATCH_PREFIX.length());
_level = "SCRATCH";
if(memory_type.find(ASYNC_PREFIX) == 0)
{
_type = memory_type.substr(ASYNC_PREFIX.length()); // RECLAIM
}
}
if(_type == "ALLOCATE")
{
_type = "ALLOC";
}
return std::make_pair(_type, _level);
};
# endif
return [&]([[maybe_unused]] const storage_parsed_type_base& parsed) {
# if ROCPROFSYS_USE_ROCM > 0
auto _mas = static_cast<const struct memory_allocate_sample&>(parsed);
auto data_processor = get_data_processor();
auto& n_info = node_info::get_instance();
auto process = m_metadata.get_process_info();
auto thread_primary_key =
data_processor->map_thread_id_to_primary_key(_mas.thread_id);
auto agent_primary_key = std::optional<uint64_t>{};
const auto invalid_context = ROCPROFILER_CONTEXT_NONE;
if(_mas.agent_id_handle != invalid_context.handle)
{
{
agent_primary_key =
m_agent_manager.get_agent_by_handle(_mas.agent_id_handle).base_id;
}
const auto* _name = m_metadata.get_buffer_name_info().at(
static_cast<rocprofiler_buffer_tracing_kind_t>(_mas.kind),
static_cast<rocprofiler_tracing_operation_t>(_mas.operation));
auto [type, level] = memtype_to_db(_name);
auto stack_id = _mas.correlation_id_internal;
auto parent_stack_id = _mas.correlation_id_ancestor;
auto correlation_id = 0;
auto queue_id = 0;
auto category_primary_key = data_processor->insert_string(
trait::name<category::rocm_memory_allocate>::value);
auto event_primary_key = data_processor->insert_event(
category_primary_key, stack_id, parent_stack_id, correlation_id);
data_processor->insert_memory_alloc(
n_info.id, process.pid, thread_primary_key, agent_primary_key,
type.c_str(), level.c_str(), _mas.start_timestamp, _mas.end_timestamp,
_mas.address_value, _mas.allocation_size, queue_id, _mas.stream_handle,
event_primary_key);
# endif
};
};
}
#endif
postprocessing_callback
rocpd_post_processing::get_region_callback() const
{
[[maybe_unused]] auto parse_args = []([[maybe_unused]] const std::string& arg_str) {
#if ROCPROFSYS_USE_ROCM > 0
rocprofiler_sdk::function_args_t args;
const std::string delimiter = ";;";
auto split = [](const std::string& str, const std::string& _delimiter) {
std::vector<std::string> tokens;
size_t start = 0;
size_t end = str.find(_delimiter);
while(end != std::string::npos)
{
tokens.push_back(str.substr(start, end - start));
start = end + _delimiter.length();
end = str.find(_delimiter, start);
}
return tokens;
};
if(arg_str.empty())
{
return args;
}
auto tokens = split(arg_str, delimiter);
// Ensure the number of tokens is a multiple of 4
if(tokens.size() % 4 != 0)
{
throw std::invalid_argument("Malformed argument string.");
}
for(auto it = tokens.begin(); it != tokens.end(); it += 4)
{
rocprofiler_sdk::argument_info arg = { static_cast<uint32_t>(std::stoi(*it)),
*(it + 1), *(it + 2), *(it + 3) };
args.push_back(arg);
}
return args;
#endif
};
return [&]([[maybe_unused]] const storage_parsed_type_base& parsed) {
#if ROCPROFSYS_USE_ROCM > 0
auto _rs = static_cast<const struct region_sample&>(parsed);
auto data_processor = get_data_processor();
auto& n_info = node_info::get_instance();
auto process = m_metadata.get_process_info();
auto thread_primary_key =
data_processor->map_thread_id_to_primary_key(_rs.thread_id);
auto name_primary_key = data_processor->insert_string(_rs.name.c_str());
auto category_primary_key = data_processor->insert_string(_rs.category.c_str());
size_t stack_id = _rs.correlation_id_internal;
size_t parent_stack_id = _rs.correlation_id_ancestor;
size_t correlation_id = 0;
auto event_primary_key =
data_processor->insert_event(category_primary_key, stack_id, parent_stack_id,
correlation_id, _rs.call_stack.c_str());
auto args = parse_args(_rs.args_str);
for(const auto& arg : args)
{
data_processor->insert_args(event_primary_key, arg.arg_number,
arg.arg_type.c_str(), arg.arg_name.c_str(),
arg.arg_value.c_str());
}
data_processor->insert_region(n_info.id, process.pid, thread_primary_key,
_rs.start_timestamp, _rs.end_timestamp,
name_primary_key, event_primary_key);
#endif
};
}
postprocessing_callback
rocpd_post_processing::get_backtrace_sample_callback() const
{
return [&](const storage_parsed_type_base& parsed) {
auto _bts = static_cast<const struct backtrace_region_sample&>(parsed);
auto data_processor = get_data_processor();
auto& n_info = node_info::get_instance();
auto process = m_metadata.get_process_info();
auto thread_primary_key =
data_processor->map_thread_id_to_primary_key(_bts.thread_id);
auto name_primary_key = data_processor->insert_string(_bts.name.c_str());
auto category_primary_key = data_processor->insert_string(_bts.category.c_str());
auto event_primary_key = data_processor->insert_event(
category_primary_key, 0, 0, 0, _bts.call_stack.c_str(),
_bts.line_info.c_str(), _bts.extdata.c_str());
data_processor->insert_region(n_info.id, process.pid, thread_primary_key,
_bts.start_timestamp, _bts.end_timestamp,
name_primary_key, event_primary_key);
data_processor->insert_sample(_bts.track_name.c_str(), _bts.start_timestamp,
event_primary_key);
};
}
postprocessing_callback
rocpd_post_processing::get_in_time_sample_callback() const
{
return [&](const storage_parsed_type_base& parsed) {
auto _its = static_cast<const struct in_time_sample&>(parsed);
auto data_processor = get_data_processor();
auto track_primary_key = data_processor->insert_string(_its.track_name.c_str());
auto event_id = data_processor->insert_event(
track_primary_key, _its.stack_id, _its.parent_stack_id, _its.correlation_id,
_its.call_stack.c_str(), _its.line_info.c_str(), _its.event_metadata.c_str());
data_processor->insert_sample(_its.track_name.c_str(), _its.timestamp_ns,
event_id, "{}");
};
}
postprocessing_callback
rocpd_post_processing::get_pmc_event_with_sample_callback() const
{
return [&](const storage_parsed_type_base& parsed) {
auto _pmc = static_cast<const struct pmc_event_with_sample&>(parsed);
auto data_processor = get_data_processor();
auto track_primary_key = data_processor->insert_string(_pmc.track_name.c_str());
auto agent_primary_key =
m_agent_manager
.get_agent_by_id(_pmc.device_id,
static_cast<agent_type>(_pmc.device_type))
.base_id;
auto event_id = data_processor->insert_event(
track_primary_key, _pmc.stack_id, _pmc.parent_stack_id, _pmc.correlation_id,
_pmc.call_stack.c_str(), _pmc.line_info.c_str(), _pmc.event_metadata.c_str());
data_processor->insert_sample(_pmc.track_name.c_str(), _pmc.timestamp_ns,
event_id, "{}");
data_processor->insert_pmc_event(event_id, agent_primary_key,
_pmc.pmc_info_name.c_str(), _pmc.value);
};
}
postprocessing_callback
rocpd_post_processing::get_amd_smi_sample_callback() const
{
// Use the shared gpu_metrics_t from core/gpu_metrics.hpp
using gpu_metrics_t = gpu::gpu_metrics_t;
return [&](const storage_parsed_type_base& parsed) {
auto _amd_smi = static_cast<const struct amd_smi_sample&>(parsed);
auto data_processor = get_data_processor();
const auto* _name = trait::name<category::amd_smi>::value;
auto name_primary_key = data_processor->insert_string(_name);
auto event_id = data_processor->insert_event(name_primary_key, 0, 0, 0);
auto base_id =
m_agent_manager.get_agent_by_type_index(_amd_smi.device_id, agent_type::GPU)
.base_id;
auto insert_event_and_sample = [&](bool enabled, const char* pmc_name,
const char* track_name, double value) {
if(!enabled) return;
data_processor->insert_pmc_event(event_id, base_id, pmc_name, value);
data_processor->insert_sample(track_name, _amd_smi.timestamp, event_id);
};
using pos = trace_cache::amd_smi_sample::settings_positions;
std::bitset<8> settings_bits(_amd_smi.settings);
bool is_busy_enabled = settings_bits.test(static_cast<int>(pos::busy));
bool is_temp_enabled = settings_bits.test(static_cast<int>(pos::temp));
bool is_power_enabled = settings_bits.test(static_cast<int>(pos::power));
bool is_mem_usage_enabled = settings_bits.test(static_cast<int>(pos::mem_usage));
bool is_vcn_enabled = settings_bits.test(static_cast<int>(pos::vcn_activity));
bool is_jpeg_enabled = settings_bits.test(static_cast<int>(pos::jpeg_activity));
bool is_xgmi_enabled = settings_bits.test(static_cast<int>(pos::xgmi));
bool is_pcie_enabled = settings_bits.test(static_cast<int>(pos::pcie));
insert_event_and_sample(
is_busy_enabled, trait::name<category::amd_smi_gfx_busy>::value,
info::annotate_with_device_id<category::amd_smi_gfx_busy>(_amd_smi.device_id)
.c_str(),
_amd_smi.gfx_activity);
insert_event_and_sample(
is_busy_enabled, trait::name<category::amd_smi_umc_busy>::value,
info::annotate_with_device_id<category::amd_smi_umc_busy>(_amd_smi.device_id)
.c_str(),
_amd_smi.umc_activity);
insert_event_and_sample(
is_busy_enabled, trait::name<category::amd_smi_mm_busy>::value,
info::annotate_with_device_id<category::amd_smi_mm_busy>(_amd_smi.device_id)
.c_str(),
_amd_smi.mm_activity);
insert_event_and_sample(
is_temp_enabled, trait::name<category::amd_smi_temp>::value,
info::annotate_with_device_id<category::amd_smi_temp>(_amd_smi.device_id)
.c_str(),
_amd_smi.temperature);
insert_event_and_sample(
is_power_enabled, trait::name<category::amd_smi_power>::value,
info::annotate_with_device_id<category::amd_smi_power>(_amd_smi.device_id)
.c_str(),
_amd_smi.power);
insert_event_and_sample(
is_mem_usage_enabled, trait::name<category::amd_smi_memory_usage>::value,
info::annotate_with_device_id<category::amd_smi_memory_usage>(
_amd_smi.device_id)
.c_str(),
_amd_smi.mem_usage);
if(!is_vcn_enabled && !is_jpeg_enabled && !is_xgmi_enabled && !is_pcie_enabled)
return;
gpu_metrics_t gpu_metrics;
gpu::gpu_metrics_capabilities_t capabilities;
gpu::deserialize_gpu_metrics(_amd_smi.gpu_activity, gpu_metrics, is_vcn_enabled,
is_jpeg_enabled, is_xgmi_enabled, is_pcie_enabled,
capabilities);
// Insert VCN and JPEG activity metrics
auto insert_decode_vector_metrics = [&](auto category, bool _is_enabled,
const std::vector<uint16_t>& data,
std::optional<size_t> _idx =
std::nullopt) {
if(!_is_enabled) return;
using Category = std::decay_t<decltype(category)>;
for(size_t i = 0; i < data.size(); ++i)
{
const auto value = data[i];
if(value == std::numeric_limits<uint16_t>::max()) continue;
auto pmc_name = info::annotate_category<Category>(_idx, i);
auto track_name =
info::annotate_with_device_id<Category>(_amd_smi.device_id, _idx, i);
insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(),
static_cast<double>(value));
}
};
// Insert XGMI read/write data metrics
auto insert_xgmi_vector_metrics = [&](auto category, bool _is_enabled,
const std::vector<uint64_t>& data,
std::optional<size_t> _idx = std::nullopt) {
if(!_is_enabled) return;
using Category = std::decay_t<decltype(category)>;
for(size_t i = 0; i < data.size(); ++i)
{
const auto value = data[i];
if(value == std::numeric_limits<uint64_t>::max()) continue;
auto pmc_name = info::annotate_category<Category>(_idx, i);
auto track_name =
info::annotate_with_device_id<Category>(_amd_smi.device_id, _idx, i);
insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(),
static_cast<double>(value));
}
};
// Insert VCN activity metrics
if(capabilities.flags.vcn_is_device_level_only)
{
// Device-level: use vcn_activity vector
insert_decode_vector_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled,
gpu_metrics.vcn_activity, std::nullopt);
}
else
{
// Per-XCP: iterate through actual XCPs in vcn_busy
for(size_t xcp = 0; xcp < gpu_metrics.vcn_busy.size(); ++xcp)
{
insert_decode_vector_metrics(category::amd_smi_vcn_activity{},
is_vcn_enabled, gpu_metrics.vcn_busy[xcp],
xcp);
}
}
// Insert JPEG activity metrics
if(capabilities.flags.jpeg_is_device_level_only)
{
// Device-level: use jpeg_activity vector
insert_decode_vector_metrics(category::amd_smi_jpeg_activity{},
is_jpeg_enabled, gpu_metrics.jpeg_activity,
std::nullopt);
}
else
{
// Per-XCP: iterate through actual XCPs in jpeg_busy
for(size_t xcp = 0; xcp < gpu_metrics.jpeg_busy.size(); ++xcp)
{
insert_decode_vector_metrics(category::amd_smi_jpeg_activity{},
is_jpeg_enabled, gpu_metrics.jpeg_busy[xcp],
xcp);
}
}
// Insert XGMI metrics (scalar values)
insert_event_and_sample(
is_xgmi_enabled, trait::name<category::amd_smi_xgmi_link_width>::value,
info::annotate_with_device_id<category::amd_smi_xgmi_link_width>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.xgmi_link_width);
insert_event_and_sample(
is_xgmi_enabled, trait::name<category::amd_smi_xgmi_link_speed>::value,
info::annotate_with_device_id<category::amd_smi_xgmi_link_speed>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.xgmi_link_speed);
insert_xgmi_vector_metrics(category::amd_smi_xgmi_read_data{}, is_xgmi_enabled,
gpu_metrics.xgmi_read_data_acc, std::nullopt);
insert_xgmi_vector_metrics(category::amd_smi_xgmi_write_data{}, is_xgmi_enabled,
gpu_metrics.xgmi_write_data_acc, std::nullopt);
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_link_width>::value,
info::annotate_with_device_id<category::amd_smi_pcie_link_width>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.pcie_link_width);
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_link_speed>::value,
info::annotate_with_device_id<category::amd_smi_pcie_link_speed>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.pcie_link_speed);
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_bandwidth_acc>::value,
info::annotate_with_device_id<category::amd_smi_pcie_bandwidth_acc>(
_amd_smi.device_id)
.c_str(),
static_cast<double>(gpu_metrics.pcie_bandwidth_acc));
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_bandwidth_inst>::value,
info::annotate_with_device_id<category::amd_smi_pcie_bandwidth_inst>(
_amd_smi.device_id)
.c_str(),
static_cast<double>(gpu_metrics.pcie_bandwidth_inst));
};
}
postprocessing_callback
rocpd_post_processing::get_cpu_freq_sample_callback() const
{
struct core_freq_sample
{
size_t id;
float value;
};
auto deserialize_freqs = [](std::vector<uint8_t>& buffer) {
std::vector<core_freq_sample> result;
size_t offset = 0;
while(offset + sizeof(float) + sizeof(size_t) <= buffer.size())
{
core_freq_sample core_sample;
std::memcpy(&core_sample.id, buffer.data() + offset, sizeof(size_t));
offset += sizeof(size_t);
std::memcpy(&core_sample.value, buffer.data() + offset, sizeof(float));
offset += sizeof(float);
result.push_back(core_sample);
}
return result;
};
return [&](const storage_parsed_type_base& parsed) {
auto _cpu_freq_sample = static_cast<const struct cpu_freq_sample&>(parsed);
auto data_processor = get_data_processor();
const auto* _name = trait::name<category::cpu_freq>::value;
auto name_primary_key = data_processor->insert_string(_name);
auto event_id = data_processor->insert_event(name_primary_key, 0, 0, 0);
auto device_id = 0;
auto base_id =
m_agent_manager.get_agent_by_type_index(device_id, agent_type::CPU).base_id;
auto insert_event_and_sample = [&](const char* name, double value) {
data_processor->insert_pmc_event(event_id, base_id, name, value);
data_processor->insert_sample(name, _cpu_freq_sample.timestamp, event_id);
};
insert_event_and_sample(trait::name<category::process_page>::value,
_cpu_freq_sample.page_rss);
insert_event_and_sample(trait::name<category::process_virt>::value,
_cpu_freq_sample.virt_mem_usage);
insert_event_and_sample(trait::name<category::process_peak>::value,
_cpu_freq_sample.peak_rss);
insert_event_and_sample(trait::name<category::process_context_switch>::value,
_cpu_freq_sample.context_switch_count);
insert_event_and_sample(trait::name<category::process_page_fault>::value,
_cpu_freq_sample.page_faults);
insert_event_and_sample(trait::name<category::process_user_mode_time>::value,
_cpu_freq_sample.user_mode_time);
insert_event_and_sample(trait::name<category::process_kernel_mode_time>::value,
_cpu_freq_sample.kernel_mode_time);
auto get_track_name = [](const auto& cpu_id) {
return std::string(trait::name<category::cpu_freq>::value) + " [" +
std::to_string(cpu_id) + "]";
};
auto core_freq_samples = deserialize_freqs(_cpu_freq_sample.freqs);
for(const auto& core : core_freq_samples)
{
insert_event_and_sample(get_track_name(core.id).c_str(), core.value);
}
};
}
rocpd_post_processing::rocpd_post_processing(metadata_registry& md,
agent_manager& agent_mngr, int pid, int ppid)
: m_metadata(md)
, m_agent_manager(agent_mngr)
, m_data_processor(std::make_shared<rocpd::data_processor>(
std::make_shared<rocpd::data_storage::database>(pid, ppid)))
{}
void
rocpd_post_processing::register_parser_callback([[maybe_unused]] storage_parser& parser)
{
#if ROCPROFSYS_USE_ROCM > 0
if(!get_use_rocpd())
{
return;
}
parser.register_type_callback(entry_type::region, get_region_callback());
parser.register_type_callback(entry_type::kernel_dispatch,
get_kernel_dispatch_callback());
parser.register_type_callback(entry_type::memory_copy, get_memory_copy_callback());
# if(ROCPROFILER_VERSION >= 600)
parser.register_type_callback(entry_type::memory_alloc,
get_memory_allocate_callback());
# endif
parser.register_type_callback(entry_type::in_time_sample,
get_in_time_sample_callback());
parser.register_type_callback(entry_type::pmc_event_with_sample,
get_pmc_event_with_sample_callback());
parser.register_type_callback(entry_type::amd_smi_sample,
get_amd_smi_sample_callback());
parser.register_type_callback(entry_type::cpu_freq_sample,
get_cpu_freq_sample_callback());
parser.register_type_callback(entry_type::backtrace_region_sample,
get_backtrace_sample_callback());
ROCPROFSYS_DEBUG("Buffer parser callbacks are registered..\n");
parser.register_on_finished_callback(
std::make_unique<std::function<void()>>([this]() {
if(m_data_processor != nullptr)
{
m_data_processor->flush();
}
}));
#endif
}
void
rocpd_post_processing::post_process_metadata()
{
#if ROCPROFSYS_USE_ROCM > 0
if(!get_use_rocpd())
{
return;
}
ROCPROFSYS_DEBUG("Post processing metadata..\n");
auto data_processor = get_data_processor();
auto n_info = node_info::get_instance();
data_processor->insert_node_info(n_info.id, n_info.hash, n_info.machine_id.c_str(),
n_info.system_name.c_str(), n_info.node_name.c_str(),
n_info.release.c_str(), n_info.version.c_str(),
n_info.machine.c_str(), n_info.domain_name.c_str());
auto process_info = m_metadata.get_process_info();
data_processor->insert_process_info(n_info.id, process_info.ppid, process_info.pid, 0,
0, process_info.start, process_info.end,
process_info.command.c_str(), "{}");
const auto& agents = m_agent_manager.get_agents();
int counter = 0;
for(const auto& rocpd_agent : agents)
{
auto _base_id = data_processor->insert_agent(
n_info.id, process_info.pid,
((rocpd_agent->type == agent_type::GPU) ? "GPU" : "CPU"), counter++,
rocpd_agent->logical_node_id, rocpd_agent->logical_node_type_id,
rocpd_agent->device_id, rocpd_agent->name.c_str(),
rocpd_agent->model_name.c_str(), rocpd_agent->vendor_name.c_str(),
rocpd_agent->product_name.c_str(), "");
rocpd_agent->base_id = _base_id;
}
auto _string_list = m_metadata.get_string_list();
for(auto& _string : _string_list)
{
data_processor->insert_string(std::string(_string).c_str());
}
auto _thread_info_list = m_metadata.get_thread_info_list();
for(auto& t_info : _thread_info_list)
{
rocpd_insert_thread_id(t_info, n_info, process_info);
}
auto _track_info_list = m_metadata.get_track_info_list();
for(auto& track : _track_info_list)
{
auto thread_id =
track.thread_id.has_value()
? std::make_optional<size_t>(data_processor->map_thread_id_to_primary_key(
track.thread_id.value()))
: std::nullopt;
data_processor->insert_track(track.track_name.c_str(), n_info.id,
process_info.pid, thread_id);
}
auto _code_object_list = m_metadata.get_code_object_list();
for(const auto& code_object : _code_object_list)
{
auto dev_id =
m_agent_manager.get_agent_by_handle(get_handle_from_code_object(code_object))
.base_id;
const char* strg_type = "UNKNOWN";
switch(code_object.storage_type)
{
case ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE: strg_type = "FILE"; break;
case ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_MEMORY: strg_type = "MEMORY"; break;
default: break;
}
data_processor->insert_code_object(code_object.code_object_id, n_info.id,
process_info.pid, dev_id, code_object.uri,
code_object.load_base, code_object.load_size,
code_object.load_delta, strg_type);
}
auto _kernel_symbols_list = m_metadata.get_kernel_symbol_list();
for(const auto& kernel_symbol : _kernel_symbols_list)
{
auto kernel_name = tim::demangle(kernel_symbol.kernel_name);
data_processor->insert_kernel_symbol(
kernel_symbol.kernel_id, n_info.id, process_info.pid,
kernel_symbol.code_object_id, kernel_symbol.kernel_name, kernel_name.c_str(),
kernel_symbol.kernel_object, kernel_symbol.kernarg_segment_size,
kernel_symbol.kernarg_segment_alignment, kernel_symbol.group_segment_size,
kernel_symbol.private_segment_size, kernel_symbol.sgpr_count,
kernel_symbol.arch_vgpr_count, kernel_symbol.accum_vgpr_count);
data_processor->insert_string(kernel_name.c_str());
}
auto _queue_list = m_metadata.get_queue_list();
for(const auto& queue_handle : _queue_list)
{
std::stringstream ss;
ss << "Queue " << queue_handle;
data_processor->insert_queue_info(queue_handle, n_info.id, process_info.pid,
ss.str().c_str());
}
auto _stream_list = m_metadata.get_stream_list();
for(const auto& stream_handle : _stream_list)
{
std::stringstream ss;
ss << "Stream " << stream_handle;
data_processor->insert_stream_info(stream_handle, n_info.id, process_info.pid,
ss.str().c_str());
}
auto buffer_info_list = m_metadata.get_buffer_name_info();
for(const auto& buffer_info : buffer_info_list)
{
for(const auto& item : buffer_info.items())
{
data_processor->insert_string(*item.second);
}
}
auto callback_info_list = m_metadata.get_callback_tracing_info();
for(const auto& cb_info : callback_info_list)
{
for(const auto& item : cb_info.items())
{
data_processor->insert_string(*item.second);
}
}
auto pmc_info_list = m_metadata.get_pmc_info_list();
for(const auto& pmc_info : pmc_info_list)
{
const auto agent_primary_key =
m_agent_manager
.get_agent_by_type_index(pmc_info.agent_type_index, pmc_info.type)
.base_id;
data_processor->insert_pmc_description(
n_info.id, process_info.pid, agent_primary_key, pmc_info.target_arch.c_str(),
pmc_info.event_code, pmc_info.instance_id, pmc_info.name.c_str(),
pmc_info.symbol.c_str(), pmc_info.description.c_str(),
pmc_info.long_description.c_str(), pmc_info.component.c_str(),
pmc_info.units.c_str(), pmc_info.value_type.c_str(), pmc_info.block.c_str(),
pmc_info.expression.c_str(), pmc_info.is_constant, pmc_info.is_derived);
}
#endif
}
inline void
rocpd_post_processing::rocpd_insert_thread_id(info::thread& t_info,
const node_info& n_info,
const info::process& process_info) const
{
const auto& extended_info = thread_info::get(t_info.thread_id, SystemTID);
if(extended_info.has_value())
{
t_info.start = extended_info->get_start();
t_info.end = extended_info->get_stop();
}
std::stringstream ss;
ss << "Thread " << t_info.thread_id;
get_data_processor()->insert_thread_info(n_info.id, process_info.ppid,
process_info.pid, t_info.thread_id,
ss.str().c_str(), t_info.start, t_info.end);
}
} // namespace trace_cache
} // namespace rocprofsys
@@ -0,0 +1,827 @@
// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "core/trace_cache/rocpd_processor.hpp"
#include "core/agent_manager.hpp"
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/gpu_metrics.hpp"
#include "core/node_info.hpp"
#include "core/rocpd/data_processor.hpp"
#include "core/rocpd/data_storage/database.hpp"
#include "core/trace_cache/metadata_registry.hpp"
#include "core/trace_cache/sample_type.hpp"
#include "library/thread_info.hpp"
#include <cstdint>
#include <limits>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <timemory/utility/demangle.hpp>
#if ROCPROFSYS_USE_ROCM > 0
# include "library/rocprofiler-sdk/fwd.hpp"
# include <rocprofiler-sdk/context.h>
# include <rocprofiler-sdk/version.h>
#endif
namespace rocprofsys
{
namespace trace_cache
{
namespace
{
#if ROCPROFSYS_USE_ROCM > 0
auto
get_handle_from_code_object(
const rocprofiler_callback_tracing_code_object_load_data_t& code_object)
{
# if(ROCPROFILER_VERSION >= 600)
return code_object.agent_id.handle;
# else
return code_object.rocp_agent.handle;
# endif
}
#endif
} // namespace
void
rocpd_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds)
{
#if ROCPROFSYS_USE_ROCM > 0
auto& n_info = node_info::get_instance();
auto process = m_metadata->get_process_info();
auto agent_primary_key =
m_agent_manager->get_agent_by_handle(_kds.agent_id_handle).base_id;
auto thread_primary_key =
m_data_processor->map_thread_id_to_primary_key(_kds.thread_id);
auto category_id = m_data_processor->insert_string(
trait::name<category::rocm_kernel_dispatch>::value);
auto kernel_symbol = m_metadata->get_kernel_symbol(_kds.kernel_id);
if(!kernel_symbol.has_value())
{
throw std::runtime_error("Kernel symbol is missing for kernel dispatch");
}
auto region_name_primary_key = m_data_processor->insert_string(
tim::demangle(kernel_symbol->kernel_name).c_str());
auto stack_id = _kds.correlation_id_internal;
auto parent_stack_id = _kds.correlation_id_ancestor;
auto correlation_id = 0;
auto event_id = m_data_processor->insert_event(category_id, stack_id, parent_stack_id,
correlation_id);
m_data_processor->insert_kernel_dispatch(
n_info.id, process.pid, thread_primary_key, agent_primary_key, _kds.kernel_id,
_kds.dispatch_id, _kds.queue_id_handle, _kds.stream_handle, _kds.start_timestamp,
_kds.end_timestamp, _kds.private_segment_size, _kds.group_segment_size,
_kds.workgroup_size_x, _kds.workgroup_size_y, _kds.workgroup_size_z,
_kds.grid_size_x, _kds.grid_size_y, _kds.grid_size_z, region_name_primary_key,
event_id);
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs)
{
#if ROCPROFSYS_USE_ROCM > 0
auto& n_info = node_info::get_instance();
auto process = m_metadata->get_process_info();
auto _name = std::string{ m_metadata->get_buffer_name_info().at(
static_cast<rocprofiler_buffer_tracing_kind_t>(_mcs.kind),
static_cast<rocprofiler_tracing_operation_t>(_mcs.operation)) };
auto name_primary_key = m_data_processor->insert_string(_name.c_str());
auto category_primary_key =
m_data_processor->insert_string(trait::name<category::rocm_memory_copy>::value);
auto thread_primary_key =
m_data_processor->map_thread_id_to_primary_key(_mcs.thread_id);
auto dst_agent_primary_key =
m_agent_manager->get_agent_by_handle(_mcs.dst_agent_id_handle).base_id;
auto src_agent_primary_key =
m_agent_manager->get_agent_by_handle(_mcs.src_agent_id_handle).base_id;
auto stack_id = _mcs.correlation_id_internal;
auto parent_stack_id = _mcs.correlation_id_ancestor;
auto correlation_id = 0;
auto queue_id = 0;
auto event_primary_key = m_data_processor->insert_event(
category_primary_key, stack_id, parent_stack_id, correlation_id);
m_data_processor->insert_memory_copy(
n_info.id, process.pid, thread_primary_key, _mcs.start_timestamp,
_mcs.end_timestamp, name_primary_key, dst_agent_primary_key,
_mcs.dst_address_value, src_agent_primary_key, _mcs.src_address_value, _mcs.bytes,
queue_id, _mcs.stream_handle, name_primary_key, event_primary_key);
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
{
#if ROCPROFSYS_USE_ROCM > 0 && (ROCPROFILER_VERSION >= 600)
static auto memtype_to_db =
[](std::string_view memory_type) -> std::pair<std::string, std::string> {
constexpr auto MEMORY_PREFIX = std::string_view{ "MEMORY_ALLOCATION_" };
constexpr auto SCRATCH_PREFIX = std::string_view{ "SCRATCH_MEMORY_" };
constexpr auto VMEM_PREFIX = std::string_view{ "VMEM_" };
constexpr auto ASYNC_PREFIX = std::string_view{ "ASYNC_" };
std::string _type;
std::string _level;
if(memory_type.find(MEMORY_PREFIX) == 0)
{
_type = memory_type.substr(MEMORY_PREFIX.length());
if(_type.find(VMEM_PREFIX) == 0)
{
_type = _type.substr(VMEM_PREFIX.length());
_level = "VIRTUAL";
}
else
{
_level = "REAL";
}
}
else if(memory_type.find(SCRATCH_PREFIX) == 0)
{
_type = memory_type.substr(SCRATCH_PREFIX.length());
_level = "SCRATCH";
if(memory_type.find(ASYNC_PREFIX) == 0)
{
_type = memory_type.substr(ASYNC_PREFIX.length()); // RECLAIM
}
}
if(_type == "ALLOCATE")
{
_type = "ALLOC";
}
return std::make_pair(_type, _level);
};
auto& n_info = node_info::get_instance();
auto process = m_metadata->get_process_info();
auto thread_primary_key =
m_data_processor->map_thread_id_to_primary_key(_mas.thread_id);
auto agent_primary_key = std::optional<uint64_t>{};
const auto invalid_context = ROCPROFILER_CONTEXT_NONE;
if(_mas.agent_id_handle != invalid_context.handle)
{
{
agent_primary_key =
m_agent_manager->get_agent_by_handle(_mas.agent_id_handle).base_id;
}
const auto* _name = m_metadata->get_buffer_name_info().at(
static_cast<rocprofiler_buffer_tracing_kind_t>(_mas.kind),
static_cast<rocprofiler_tracing_operation_t>(_mas.operation));
auto [type, level] = memtype_to_db(_name);
auto stack_id = _mas.correlation_id_internal;
auto parent_stack_id = _mas.correlation_id_ancestor;
auto correlation_id = 0;
auto queue_id = 0;
auto category_primary_key = m_data_processor->insert_string(
trait::name<category::rocm_memory_allocate>::value);
auto event_primary_key = m_data_processor->insert_event(
category_primary_key, stack_id, parent_stack_id, correlation_id);
m_data_processor->insert_memory_alloc(
n_info.id, process.pid, thread_primary_key, agent_primary_key, type.c_str(),
level.c_str(), _mas.start_timestamp, _mas.end_timestamp, _mas.address_value,
_mas.allocation_size, queue_id, _mas.stream_handle, event_primary_key);
}
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const region_sample& _rs)
{
#if ROCPROFSYS_USE_ROCM > 0
static auto parse_args = []([[maybe_unused]] const std::string& arg_str) {
rocprofiler_sdk::function_args_t args;
const std::string delimiter = ";;";
auto split = [](const std::string& str, const std::string& _delimiter) {
std::vector<std::string> tokens;
size_t start = 0;
size_t end = str.find(_delimiter);
while(end != std::string::npos)
{
tokens.push_back(str.substr(start, end - start));
start = end + _delimiter.length();
end = str.find(_delimiter, start);
}
return tokens;
};
if(arg_str.empty())
{
return args;
}
auto tokens = split(arg_str, delimiter);
// Ensure the number of tokens is a multiple of 4
if(tokens.size() % 4 != 0)
{
throw std::invalid_argument("Malformed argument string.");
}
for(auto it = tokens.begin(); it != tokens.end(); it += 4)
{
rocprofiler_sdk::argument_info arg = { static_cast<uint32_t>(std::stoi(*it)),
*(it + 1), *(it + 2), *(it + 3) };
args.push_back(arg);
}
return args;
};
auto& n_info = node_info::get_instance();
auto process = m_metadata->get_process_info();
auto thread_primary_key =
m_data_processor->map_thread_id_to_primary_key(_rs.thread_id);
auto name_primary_key = m_data_processor->insert_string(_rs.name.c_str());
auto category_primary_key = m_data_processor->insert_string(_rs.category.c_str());
size_t stack_id = _rs.correlation_id_internal;
size_t parent_stack_id = _rs.correlation_id_ancestor;
size_t correlation_id = 0;
auto event_primary_key =
m_data_processor->insert_event(category_primary_key, stack_id, parent_stack_id,
correlation_id, _rs.call_stack.c_str());
auto args = parse_args(_rs.args_str);
for(const auto& arg : args)
{
m_data_processor->insert_args(event_primary_key, arg.arg_number,
arg.arg_type.c_str(), arg.arg_name.c_str(),
arg.arg_value.c_str());
}
m_data_processor->insert_region(n_info.id, process.pid, thread_primary_key,
_rs.start_timestamp, _rs.end_timestamp,
name_primary_key, event_primary_key);
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const backtrace_region_sample& _bts)
{
#if ROCPROFSYS_USE_ROCM > 0
auto& n_info = node_info::get_instance();
auto process = m_metadata->get_process_info();
auto thread_primary_key =
m_data_processor->map_thread_id_to_primary_key(_bts.thread_id);
auto name_primary_key = m_data_processor->insert_string(_bts.name.c_str());
auto category_primary_key = m_data_processor->insert_string(_bts.category.c_str());
auto event_primary_key = m_data_processor->insert_event(
category_primary_key, 0, 0, 0, _bts.call_stack.c_str(), _bts.line_info.c_str(),
_bts.extdata.c_str());
m_data_processor->insert_region(n_info.id, process.pid, thread_primary_key,
_bts.start_timestamp, _bts.end_timestamp,
name_primary_key, event_primary_key);
m_data_processor->insert_sample(_bts.track_name.c_str(), _bts.start_timestamp,
event_primary_key);
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const in_time_sample& _its)
{
#if ROCPROFSYS_USE_ROCM > 0
auto track_primary_key = m_data_processor->insert_string(_its.track_name.c_str());
auto event_id = m_data_processor->insert_event(
track_primary_key, _its.stack_id, _its.parent_stack_id, _its.correlation_id,
_its.call_stack.c_str(), _its.line_info.c_str(), _its.event_metadata.c_str());
m_data_processor->insert_sample(_its.track_name.c_str(), _its.timestamp_ns, event_id,
"{}");
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const pmc_event_with_sample& _pmc)
{
#if ROCPROFSYS_USE_ROCM > 0
auto track_primary_key = m_data_processor->insert_string(_pmc.track_name.c_str());
auto agent_primary_key =
m_agent_manager
->get_agent_by_id(_pmc.device_id, static_cast<agent_type>(_pmc.device_type))
.base_id;
auto event_id = m_data_processor->insert_event(
track_primary_key, _pmc.stack_id, _pmc.parent_stack_id, _pmc.correlation_id,
_pmc.call_stack.c_str(), _pmc.line_info.c_str(), _pmc.event_metadata.c_str());
m_data_processor->insert_sample(_pmc.track_name.c_str(), _pmc.timestamp_ns, event_id,
"{}");
m_data_processor->insert_pmc_event(event_id, agent_primary_key,
_pmc.pmc_info_name.c_str(), _pmc.value);
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi)
{
#if ROCPROFSYS_USE_ROCM > 0
const auto* _name = trait::name<category::amd_smi>::value;
auto name_primary_key = m_data_processor->insert_string(_name);
auto event_id = m_data_processor->insert_event(name_primary_key, 0, 0, 0);
auto base_id =
m_agent_manager->get_agent_by_type_index(_amd_smi.device_id, agent_type::GPU)
.base_id;
auto insert_event_and_sample = [&](bool enabled, const char* pmc_name,
const char* track_name, double value) {
if(!enabled) return;
m_data_processor->insert_pmc_event(event_id, base_id, pmc_name, value);
m_data_processor->insert_sample(track_name, _amd_smi.timestamp, event_id);
};
using pos = trace_cache::amd_smi_sample::settings_positions;
std::bitset<8> settings_bits(_amd_smi.settings);
bool is_busy_enabled = settings_bits.test(static_cast<int>(pos::busy));
bool is_temp_enabled = settings_bits.test(static_cast<int>(pos::temp));
bool is_power_enabled = settings_bits.test(static_cast<int>(pos::power));
bool is_mem_usage_enabled = settings_bits.test(static_cast<int>(pos::mem_usage));
bool is_vcn_enabled = settings_bits.test(static_cast<int>(pos::vcn_activity));
bool is_jpeg_enabled = settings_bits.test(static_cast<int>(pos::jpeg_activity));
bool is_xgmi_enabled = settings_bits.test(static_cast<int>(pos::xgmi));
bool is_pcie_enabled = settings_bits.test(static_cast<int>(pos::pcie));
insert_event_and_sample(
is_busy_enabled, trait::name<category::amd_smi_gfx_busy>::value,
info::annotate_with_device_id<category::amd_smi_gfx_busy>(_amd_smi.device_id)
.c_str(),
_amd_smi.gfx_activity);
insert_event_and_sample(
is_busy_enabled, trait::name<category::amd_smi_umc_busy>::value,
info::annotate_with_device_id<category::amd_smi_umc_busy>(_amd_smi.device_id)
.c_str(),
_amd_smi.umc_activity);
insert_event_and_sample(
is_busy_enabled, trait::name<category::amd_smi_mm_busy>::value,
info::annotate_with_device_id<category::amd_smi_mm_busy>(_amd_smi.device_id)
.c_str(),
_amd_smi.mm_activity);
insert_event_and_sample(
is_temp_enabled, trait::name<category::amd_smi_temp>::value,
info::annotate_with_device_id<category::amd_smi_temp>(_amd_smi.device_id).c_str(),
_amd_smi.temperature);
insert_event_and_sample(
is_power_enabled, trait::name<category::amd_smi_power>::value,
info::annotate_with_device_id<category::amd_smi_power>(_amd_smi.device_id)
.c_str(),
_amd_smi.power);
insert_event_and_sample(
is_mem_usage_enabled, trait::name<category::amd_smi_memory_usage>::value,
info::annotate_with_device_id<category::amd_smi_memory_usage>(_amd_smi.device_id)
.c_str(),
_amd_smi.mem_usage);
if(!is_vcn_enabled && !is_jpeg_enabled && !is_xgmi_enabled && !is_pcie_enabled)
return;
gpu::gpu_metrics_t gpu_metrics;
gpu::gpu_metrics_capabilities_t capabilities;
gpu::deserialize_gpu_metrics(_amd_smi.gpu_activity, gpu_metrics, is_vcn_enabled,
is_jpeg_enabled, is_xgmi_enabled, is_pcie_enabled,
capabilities);
// Insert VCN and JPEG activity metrics
auto insert_decode_vector_metrics = [&](auto category, bool _is_enabled,
const std::vector<uint16_t>& data,
std::optional<size_t> _idx = std::nullopt) {
if(!_is_enabled) return;
using Category = std::decay_t<decltype(category)>;
for(size_t i = 0; i < data.size(); ++i)
{
const auto value = data[i];
if(value == std::numeric_limits<uint16_t>::max()) continue;
auto pmc_name = info::annotate_category<Category>(_idx, i);
auto track_name =
info::annotate_with_device_id<Category>(_amd_smi.device_id, _idx, i);
insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(),
static_cast<double>(value));
}
};
// Insert XGMI read/write data metrics
auto insert_xgmi_vector_metrics = [&](auto category, bool _is_enabled,
const std::vector<uint64_t>& data,
std::optional<size_t> _idx = std::nullopt) {
if(!_is_enabled) return;
using Category = std::decay_t<decltype(category)>;
for(size_t i = 0; i < data.size(); ++i)
{
const auto value = data[i];
if(value == std::numeric_limits<uint64_t>::max()) continue;
auto pmc_name = info::annotate_category<Category>(_idx, i);
auto track_name =
info::annotate_with_device_id<Category>(_amd_smi.device_id, _idx, i);
insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(),
static_cast<double>(value));
}
};
// Insert VCN activity metrics
if(capabilities.flags.vcn_is_device_level_only)
{
// Device-level: use vcn_activity vector
insert_decode_vector_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled,
gpu_metrics.vcn_activity, std::nullopt);
}
else
{
// Per-XCP: iterate through actual XCPs in vcn_busy
for(size_t xcp = 0; xcp < gpu_metrics.vcn_busy.size(); ++xcp)
{
insert_decode_vector_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled,
gpu_metrics.vcn_busy[xcp], xcp);
}
}
// Insert JPEG activity metrics
if(capabilities.flags.jpeg_is_device_level_only)
{
// Device-level: use jpeg_activity vector
insert_decode_vector_metrics(category::amd_smi_jpeg_activity{}, is_jpeg_enabled,
gpu_metrics.jpeg_activity, std::nullopt);
}
else
{
// Per-XCP: iterate through actual XCPs in jpeg_busy
for(size_t xcp = 0; xcp < gpu_metrics.jpeg_busy.size(); ++xcp)
{
insert_decode_vector_metrics(category::amd_smi_jpeg_activity{},
is_jpeg_enabled, gpu_metrics.jpeg_busy[xcp],
xcp);
}
}
// Insert XGMI metrics (scalar values)
insert_event_and_sample(
is_xgmi_enabled, trait::name<category::amd_smi_xgmi_link_width>::value,
info::annotate_with_device_id<category::amd_smi_xgmi_link_width>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.xgmi_link_width);
insert_event_and_sample(
is_xgmi_enabled, trait::name<category::amd_smi_xgmi_link_speed>::value,
info::annotate_with_device_id<category::amd_smi_xgmi_link_speed>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.xgmi_link_speed);
insert_xgmi_vector_metrics(category::amd_smi_xgmi_read_data{}, is_xgmi_enabled,
gpu_metrics.xgmi_read_data_acc, std::nullopt);
insert_xgmi_vector_metrics(category::amd_smi_xgmi_write_data{}, is_xgmi_enabled,
gpu_metrics.xgmi_write_data_acc, std::nullopt);
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_link_width>::value,
info::annotate_with_device_id<category::amd_smi_pcie_link_width>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.pcie_link_width);
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_link_speed>::value,
info::annotate_with_device_id<category::amd_smi_pcie_link_speed>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.pcie_link_speed);
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_bandwidth_acc>::value,
info::annotate_with_device_id<category::amd_smi_pcie_bandwidth_acc>(
_amd_smi.device_id)
.c_str(),
static_cast<double>(gpu_metrics.pcie_bandwidth_acc));
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_bandwidth_inst>::value,
info::annotate_with_device_id<category::amd_smi_pcie_bandwidth_inst>(
_amd_smi.device_id)
.c_str(),
static_cast<double>(gpu_metrics.pcie_bandwidth_inst));
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const cpu_freq_sample& _cpu_freq_sample)
{
#if ROCPROFSYS_USE_ROCM > 0
struct core_freq_sample
{
size_t id;
float value;
};
auto deserialize_freqs = [](const std::vector<uint8_t>& buffer) {
std::vector<core_freq_sample> result;
size_t offset = 0;
while(offset + sizeof(float) + sizeof(size_t) <= buffer.size())
{
core_freq_sample core_sample;
std::memcpy(&core_sample.id, buffer.data() + offset, sizeof(size_t));
offset += sizeof(size_t);
std::memcpy(&core_sample.value, buffer.data() + offset, sizeof(float));
offset += sizeof(float);
result.push_back(core_sample);
}
return result;
};
const auto* _name = trait::name<category::cpu_freq>::value;
auto name_primary_key = m_data_processor->insert_string(_name);
auto event_id = m_data_processor->insert_event(name_primary_key, 0, 0, 0);
auto device_id = 0;
auto base_id =
m_agent_manager->get_agent_by_type_index(device_id, agent_type::CPU).base_id;
auto insert_event_and_sample = [&](const char* name, double value) {
m_data_processor->insert_pmc_event(event_id, base_id, name, value);
m_data_processor->insert_sample(name, _cpu_freq_sample.timestamp, event_id);
};
insert_event_and_sample(trait::name<category::process_page>::value,
_cpu_freq_sample.page_rss);
insert_event_and_sample(trait::name<category::process_virt>::value,
_cpu_freq_sample.virt_mem_usage);
insert_event_and_sample(trait::name<category::process_peak>::value,
_cpu_freq_sample.peak_rss);
insert_event_and_sample(trait::name<category::process_context_switch>::value,
_cpu_freq_sample.context_switch_count);
insert_event_and_sample(trait::name<category::process_page_fault>::value,
_cpu_freq_sample.page_faults);
insert_event_and_sample(trait::name<category::process_user_mode_time>::value,
_cpu_freq_sample.user_mode_time);
insert_event_and_sample(trait::name<category::process_kernel_mode_time>::value,
_cpu_freq_sample.kernel_mode_time);
auto get_track_name = [](const auto& cpu_id) {
return std::string(trait::name<category::cpu_freq>::value) + " [" +
std::to_string(cpu_id) + "]";
};
auto core_freq_samples = deserialize_freqs(_cpu_freq_sample.freqs);
for(const auto& core : core_freq_samples)
{
insert_event_and_sample(get_track_name(core.id).c_str(), core.value);
}
#endif
}
rocpd_processor_t::rocpd_processor_t(const std::shared_ptr<metadata_registry>& md,
const std::shared_ptr<agent_manager>& agent_mngr,
int pid, int ppid)
: processor_t<rocpd_processor_t>()
, m_metadata(md)
, m_agent_manager(agent_mngr)
, m_data_processor(std::make_shared<rocpd::data_processor>(
std::make_shared<rocpd::data_storage::database>(pid, ppid)))
{}
void
rocpd_processor_t::prepare_for_processing()
{
post_process_metadata();
}
void
rocpd_processor_t::finalize_processing()
{
m_data_processor->flush();
}
void
rocpd_processor_t::post_process_metadata()
{
#if ROCPROFSYS_USE_ROCM > 0
if(!get_use_rocpd())
{
return;
}
ROCPROFSYS_DEBUG("Post processing metadata..\n");
auto n_info = node_info::get_instance();
m_data_processor->insert_node_info(
n_info.id, n_info.hash, n_info.machine_id.c_str(), n_info.system_name.c_str(),
n_info.node_name.c_str(), n_info.release.c_str(), n_info.version.c_str(),
n_info.machine.c_str(), n_info.domain_name.c_str());
auto process_info = m_metadata->get_process_info();
m_data_processor->insert_process_info(n_info.id, process_info.ppid, process_info.pid,
0, 0, process_info.start, process_info.end,
process_info.command.c_str(), "{}");
const auto& agents = m_agent_manager->get_agents();
int counter = 0;
for(const auto& rocpd_agent : agents)
{
auto _base_id = m_data_processor->insert_agent(
n_info.id, process_info.pid,
((rocpd_agent->type == agent_type::GPU) ? "GPU" : "CPU"), counter++,
rocpd_agent->logical_node_id, rocpd_agent->logical_node_type_id,
rocpd_agent->device_id, rocpd_agent->name.c_str(),
rocpd_agent->model_name.c_str(), rocpd_agent->vendor_name.c_str(),
rocpd_agent->product_name.c_str(), "");
rocpd_agent->base_id = _base_id;
}
auto _string_list = m_metadata->get_string_list();
for(auto& _string : _string_list)
{
m_data_processor->insert_string(std::string(_string).c_str());
}
auto _thread_info_list = m_metadata->get_thread_info_list();
for(auto& t_info : _thread_info_list)
{
insert_thread_id(t_info, n_info, process_info);
}
auto _track_info_list = m_metadata->get_track_info_list();
for(auto& track : _track_info_list)
{
auto thread_id = track.thread_id.has_value()
? std::make_optional<size_t>(
m_data_processor->map_thread_id_to_primary_key(
track.thread_id.value()))
: std::nullopt;
m_data_processor->insert_track(track.track_name.c_str(), n_info.id,
process_info.pid, thread_id);
}
auto _code_object_list = m_metadata->get_code_object_list();
for(const auto& code_object : _code_object_list)
{
auto dev_id =
m_agent_manager->get_agent_by_handle(get_handle_from_code_object(code_object))
.base_id;
const char* strg_type = "UNKNOWN";
switch(code_object.storage_type)
{
case ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE: strg_type = "FILE"; break;
case ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_MEMORY: strg_type = "MEMORY"; break;
default: break;
}
m_data_processor->insert_code_object(code_object.code_object_id, n_info.id,
process_info.pid, dev_id, code_object.uri,
code_object.load_base, code_object.load_size,
code_object.load_delta, strg_type);
}
auto _kernel_symbols_list = m_metadata->get_kernel_symbol_list();
for(const auto& kernel_symbol : _kernel_symbols_list)
{
auto kernel_name = tim::demangle(kernel_symbol.kernel_name);
m_data_processor->insert_kernel_symbol(
kernel_symbol.kernel_id, n_info.id, process_info.pid,
kernel_symbol.code_object_id, kernel_symbol.kernel_name, kernel_name.c_str(),
kernel_symbol.kernel_object, kernel_symbol.kernarg_segment_size,
kernel_symbol.kernarg_segment_alignment, kernel_symbol.group_segment_size,
kernel_symbol.private_segment_size, kernel_symbol.sgpr_count,
kernel_symbol.arch_vgpr_count, kernel_symbol.accum_vgpr_count);
m_data_processor->insert_string(kernel_name.c_str());
}
auto _queue_list = m_metadata->get_queue_list();
for(const auto& queue_handle : _queue_list)
{
std::stringstream ss;
ss << "Queue " << queue_handle;
m_data_processor->insert_queue_info(queue_handle, n_info.id, process_info.pid,
ss.str().c_str());
}
auto _stream_list = m_metadata->get_stream_list();
for(const auto& stream_handle : _stream_list)
{
std::stringstream ss;
ss << "Stream " << stream_handle;
m_data_processor->insert_stream_info(stream_handle, n_info.id, process_info.pid,
ss.str().c_str());
}
auto buffer_info_list = m_metadata->get_buffer_name_info();
for(const auto& buffer_info : buffer_info_list)
{
for(const auto& item : buffer_info.items())
{
m_data_processor->insert_string(*item.second);
}
}
auto callback_info_list = m_metadata->get_callback_tracing_info();
for(const auto& cb_info : callback_info_list)
{
for(const auto& item : cb_info.items())
{
m_data_processor->insert_string(*item.second);
}
}
auto pmc_info_list = m_metadata->get_pmc_info_list();
for(const auto& pmc_info : pmc_info_list)
{
const auto agent_primary_key =
m_agent_manager
->get_agent_by_type_index(pmc_info.agent_type_index, pmc_info.type)
.base_id;
m_data_processor->insert_pmc_description(
n_info.id, process_info.pid, agent_primary_key, pmc_info.target_arch.c_str(),
pmc_info.event_code, pmc_info.instance_id, pmc_info.name.c_str(),
pmc_info.symbol.c_str(), pmc_info.description.c_str(),
pmc_info.long_description.c_str(), pmc_info.component.c_str(),
pmc_info.units.c_str(), pmc_info.value_type.c_str(), pmc_info.block.c_str(),
pmc_info.expression.c_str(), pmc_info.is_constant, pmc_info.is_derived);
}
#endif
}
inline void
rocpd_processor_t::insert_thread_id(info::thread& t_info, const node_info& n_info,
const info::process& process_info)
{
const auto& extended_info = thread_info::get(t_info.thread_id, SystemTID);
if(extended_info.has_value())
{
t_info.start = extended_info->get_start();
t_info.end = extended_info->get_stop();
}
std::stringstream ss;
ss << "Thread " << t_info.thread_id;
m_data_processor->insert_thread_info(n_info.id, process_info.ppid, process_info.pid,
t_info.thread_id, ss.str().c_str(), t_info.start,
t_info.end);
}
} // namespace trace_cache
} // namespace rocprofsys
@@ -25,44 +25,44 @@
#include "core/node_info.hpp"
#include "core/rocpd/data_processor.hpp"
#include "core/trace_cache/metadata_registry.hpp"
#include "core/trace_cache/storage_parser.hpp"
#include "core/trace_cache/sample_processor.hpp"
#include "trace_cache/sample_type.hpp"
namespace rocprofsys
{
namespace trace_cache
{
class rocpd_post_processing
class rocpd_processor_t : public processor_t<rocpd_processor_t>
{
public:
rocpd_post_processing(metadata_registry& metadata, agent_manager& agent_mngr, int pid,
int ppid);
rocpd_processor_t(const std::shared_ptr<metadata_registry>& metadata,
const std::shared_ptr<agent_manager>& agent_mngr, int pid,
int ppid);
void register_parser_callback(storage_parser& parser);
void post_process_metadata();
void prepare_for_processing();
void finalize_processing();
std::shared_ptr<rocpd::data_processor> get_data_processor() const;
void handle(const kernel_dispatch_sample& sample);
void handle(const memory_copy_sample& sample);
void handle(const memory_allocate_sample& sample);
void handle(const region_sample& sample);
void handle(const in_time_sample& sample);
void handle(const pmc_event_with_sample& sample);
void handle(const amd_smi_sample& sample);
void handle(const cpu_freq_sample& sample);
void handle(const backtrace_region_sample& sample);
private:
using primary_key = size_t;
inline void rocpd_insert_thread_id(info::thread& t_info, const node_info& n_info,
const info::process& process_info) const;
void post_process_metadata();
inline void insert_thread_id(info::thread& t_info, const node_info& n_info,
const info::process& process_info);
postprocessing_callback get_kernel_dispatch_callback() const;
postprocessing_callback get_memory_copy_callback() const;
#if(ROCPROFILER_VERSION >= 600)
postprocessing_callback get_memory_allocate_callback() const;
#endif
postprocessing_callback get_region_callback() const;
postprocessing_callback get_in_time_sample_callback() const;
postprocessing_callback get_pmc_event_with_sample_callback() const;
postprocessing_callback get_amd_smi_sample_callback() const;
postprocessing_callback get_cpu_freq_sample_callback() const;
postprocessing_callback get_backtrace_sample_callback() const;
metadata_registry& m_metadata;
agent_manager& m_agent_manager;
std::shared_ptr<metadata_registry> m_metadata;
std::shared_ptr<agent_manager> m_agent_manager;
std::shared_ptr<rocpd::data_processor> m_data_processor;
};
@@ -0,0 +1,310 @@
// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/trace_cache/cacheable.hpp"
#include "core/trace_cache/sample_type.hpp"
#if ROCPROFSYS_USE_ROCM > 0
# include <rocprofiler-sdk/version.h>
#endif
#include <vector>
namespace rocprofsys
{
namespace trace_cache
{
template <typename T>
struct processor_t
{
void handle(const kernel_dispatch_sample& sample)
{
static_cast<T*>(this)->handle(sample);
}
void handle(const memory_copy_sample& sample)
{
static_cast<T*>(this)->handle(sample);
}
#if(ROCPROFILER_VERSION >= 600)
void handle(const memory_allocate_sample& sample)
{
static_cast<T*>(this)->handle(sample);
}
#endif
void handle(const region_sample& sample) { static_cast<T*>(this)->handle(sample); }
void handle(const in_time_sample& sample) { static_cast<T*>(this)->handle(sample); }
void handle(const pmc_event_with_sample& sample)
{
static_cast<T*>(this)->handle(sample);
}
void handle(const amd_smi_sample& sample) { static_cast<T*>(this)->handle(sample); }
void handle(const cpu_freq_sample& sample) { static_cast<T*>(this)->handle(sample); }
void handle(const backtrace_region_sample& sample)
{
static_cast<T*>(this)->handle(sample);
}
void prepare_for_processing() { static_cast<T*>(this)->prepare_for_processing(); }
void finalize_processing() { static_cast<T*>(this)->finalize_processing(); }
protected:
~processor_t() = default;
};
struct processor_view_t
{
using kernel_dispatch_fn_t = void (*)(void*, const kernel_dispatch_sample&) noexcept;
using memory_copy_fn_t = void (*)(void*, const memory_copy_sample&) noexcept;
#if(ROCPROFILER_VERSION >= 600)
using memory_allocate_fn_t = void (*)(void*, const memory_allocate_sample&) noexcept;
#endif
using region_fn_t = void (*)(void*, const region_sample&) noexcept;
using in_time_sample_fn_t = void (*)(void*, const in_time_sample&) noexcept;
using pmc_event_fn_t = void (*)(void*, const pmc_event_with_sample&) noexcept;
using amd_smi_sample_fn_t = void (*)(void*, const amd_smi_sample&) noexcept;
using cpu_freq_sample_fn_t = void (*)(void*, const cpu_freq_sample&) noexcept;
using backtrace_region_fn_t = void (*)(void*,
const backtrace_region_sample&) noexcept;
using prepare_for_processing_fn_t = void (*)(void*) noexcept;
using finalize_processing_fn_t = void (*)(void*) noexcept;
struct vtable_t
{
kernel_dispatch_fn_t handle_kernel_dispatch;
memory_copy_fn_t handle_memory_copy;
#if(ROCPROFILER_VERSION >= 600)
memory_allocate_fn_t handle_memory_allocate;
#endif
region_fn_t handle_region;
in_time_sample_fn_t handle_in_time_sample;
pmc_event_fn_t handle_pmc_event;
amd_smi_sample_fn_t handle_amd_smi_sample;
cpu_freq_sample_fn_t handle_cpu_freq_sample;
backtrace_region_fn_t handle_backtrace_region;
prepare_for_processing_fn_t prepare_for_processing;
finalize_processing_fn_t finalize_processing;
};
template <typename T>
explicit processor_view_t(T& t) noexcept
: m_object{ std::addressof(t) }
, m_vtable{ std::addressof(get_vtable_for_type<T>()) }
{
static_assert(std::is_base_of<processor_t<T>, T>::value,
"Type must be derived from processor_t<T>");
}
processor_view_t(const processor_view_t&) noexcept = default;
processor_view_t(processor_view_t&&) noexcept = default;
processor_view_t& operator=(const processor_view_t&) noexcept = default;
processor_view_t& operator=(processor_view_t&&) noexcept = default;
ROCPROFSYS_INLINE void handle(const kernel_dispatch_sample& sample) const noexcept
{
m_vtable->handle_kernel_dispatch(m_object, sample);
}
ROCPROFSYS_INLINE void handle(const memory_copy_sample& sample) const noexcept
{
m_vtable->handle_memory_copy(m_object, sample);
}
#if(ROCPROFILER_VERSION >= 600)
ROCPROFSYS_INLINE void handle(const memory_allocate_sample& sample) const noexcept
{
m_vtable->handle_memory_allocate(m_object, sample);
}
#endif
ROCPROFSYS_INLINE void handle(const region_sample& sample) const noexcept
{
m_vtable->handle_region(m_object, sample);
}
ROCPROFSYS_INLINE void handle(const in_time_sample& sample) const noexcept
{
m_vtable->handle_in_time_sample(m_object, sample);
}
ROCPROFSYS_INLINE void handle(const pmc_event_with_sample& sample) const noexcept
{
m_vtable->handle_pmc_event(m_object, sample);
}
ROCPROFSYS_INLINE void handle(const amd_smi_sample& sample) const noexcept
{
m_vtable->handle_amd_smi_sample(m_object, sample);
}
ROCPROFSYS_INLINE void handle(const cpu_freq_sample& sample) const noexcept
{
m_vtable->handle_cpu_freq_sample(m_object, sample);
}
ROCPROFSYS_INLINE void handle(const backtrace_region_sample& sample) const noexcept
{
m_vtable->handle_backtrace_region(m_object, sample);
}
ROCPROFSYS_INLINE void prepare_for_processing() const noexcept
{
m_vtable->prepare_for_processing(m_object);
}
ROCPROFSYS_INLINE void finalize_processing() const noexcept
{
m_vtable->finalize_processing(m_object);
}
private:
template <typename T>
static inline const vtable_t& get_vtable_for_type() noexcept
{
static const vtable_t vtable{
+[](void* obj, const kernel_dispatch_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj, const memory_copy_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
#if(ROCPROFILER_VERSION >= 600)
+[](void* obj, const memory_allocate_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
#endif
+[](void* obj, const region_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj, const in_time_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj, const pmc_event_with_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj, const amd_smi_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj, const cpu_freq_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj, const backtrace_region_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj) noexcept { static_cast<T*>(obj)->prepare_for_processing(); },
+[](void* obj) noexcept { static_cast<T*>(obj)->finalize_processing(); }
};
return vtable;
}
void* m_object;
const vtable_t* m_vtable;
};
struct sample_processor_t
{
void clear_handlers() { m_processor_view_list.clear(); }
template <typename T>
void add_handler(T& handler)
{
m_processor_view_list.emplace_back(handler);
}
template <typename SampleType>
ROCPROFSYS_INLINE void handle_sample(const SampleType& sample) const
{
for(const auto& view : m_processor_view_list)
view.handle(sample);
}
ROCPROFSYS_INLINE void prepare_for_processing() const noexcept
{
for(const auto& view : m_processor_view_list)
view.prepare_for_processing();
}
ROCPROFSYS_INLINE void finalize_processing() const noexcept
{
for(const auto& view : m_processor_view_list)
view.finalize_processing();
}
ROCPROFSYS_INLINE bool is_empty() const noexcept
{
return m_processor_view_list.empty();
}
ROCPROFSYS_INLINE void execute_sample_processing(
type_identifier_t type_identifier, const trace_cache::cacheable_t& sample) const
{
switch(type_identifier)
{
case type_identifier_t::region:
handle_sample(static_cast<const region_sample&>(sample));
break;
case type_identifier_t::kernel_dispatch:
handle_sample(static_cast<const kernel_dispatch_sample&>(sample));
break;
case type_identifier_t::memory_copy:
handle_sample(static_cast<const memory_copy_sample&>(sample));
break;
#if ROCPROFILER_VERSION >= 600
case type_identifier_t::memory_alloc:
handle_sample(static_cast<const memory_allocate_sample&>(sample));
break;
#endif
case type_identifier_t::in_time_sample:
handle_sample(static_cast<const in_time_sample&>(sample));
break;
case type_identifier_t::pmc_event_with_sample:
handle_sample(static_cast<const pmc_event_with_sample&>(sample));
break;
case type_identifier_t::amd_smi_sample:
handle_sample(static_cast<const amd_smi_sample&>(sample));
break;
case type_identifier_t::cpu_freq_sample:
handle_sample(static_cast<const cpu_freq_sample&>(sample));
break;
case type_identifier_t::backtrace_region_sample:
handle_sample(static_cast<const backtrace_region_sample&>(sample));
break;
default: throw std::runtime_error("Unsupported sample type");
}
}
private:
std::vector<processor_view_t> m_processor_view_list;
};
} // namespace trace_cache
} // namespace rocprofsys
@@ -21,6 +21,7 @@
// SOFTWARE.
#pragma once
#include "core/trace_cache/cacheable.hpp"
#include <cstdint>
#include <stdint.h>
#include <string>
@@ -28,36 +29,70 @@
#include <utility>
#include <vector>
#if ROCPROFSYS_USE_ROCM > 0
# include <rocprofiler-sdk/version.h>
#endif
namespace rocprofsys
{
namespace trace_cache
{
struct storage_parsed_type_base
{};
struct kernel_dispatch_sample : storage_parsed_type_base
enum class type_identifier_t : uint32_t
{
// Timing fields
in_time_sample = 0x0000,
pmc_event_with_sample = 0x0001,
region = 0x0002,
kernel_dispatch = 0x0003,
memory_copy = 0x0004,
memory_alloc = 0x0005,
amd_smi_sample = 0x0006,
cpu_freq_sample = 0x0007,
backtrace_region_sample = 0x0008,
fragmented_space = 0xFFFF
};
struct kernel_dispatch_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier =
type_identifier_t::kernel_dispatch;
kernel_dispatch_sample() = default;
kernel_dispatch_sample(uint64_t _start_timestamp, uint64_t _end_timestamp,
uint64_t _thread_id, uint64_t _agent_id_handle,
uint64_t _kernel_id, uint64_t _dispatch_id,
uint64_t _queue_id_handle, uint64_t _correlation_id_internal,
uint64_t _correlation_id_ancestor,
uint32_t _private_segment_size, uint32_t _group_segment_size,
uint32_t _workgroup_size_x, uint32_t _workgroup_size_y,
uint32_t _workgroup_size_z, uint32_t _grid_size_x,
uint32_t _grid_size_y, uint32_t _grid_size_z,
size_t _stream_handle)
: start_timestamp(_start_timestamp)
, end_timestamp(_end_timestamp)
, thread_id(_thread_id)
, agent_id_handle(_agent_id_handle)
, kernel_id(_kernel_id)
, dispatch_id(_dispatch_id)
, queue_id_handle(_queue_id_handle)
, correlation_id_internal(_correlation_id_internal)
, correlation_id_ancestor(_correlation_id_ancestor)
, private_segment_size(_private_segment_size)
, group_segment_size(_group_segment_size)
, workgroup_size_x(_workgroup_size_x)
, workgroup_size_y(_workgroup_size_y)
, workgroup_size_z(_workgroup_size_z)
, grid_size_x(_grid_size_x)
, grid_size_y(_grid_size_y)
, grid_size_z(_grid_size_z)
, stream_handle(_stream_handle)
{}
uint64_t start_timestamp;
uint64_t end_timestamp;
// Identification fields
uint64_t thread_id;
uint64_t agent_id_handle;
uint64_t kernel_id;
uint64_t dispatch_id;
uint64_t queue_id_handle;
// Correlation fields
uint64_t correlation_id_internal;
uint64_t correlation_id_ancestor;
// Dispatch configuration
uint32_t private_segment_size;
uint32_t group_segment_size;
uint32_t workgroup_size_x;
@@ -66,69 +101,206 @@ struct kernel_dispatch_sample : storage_parsed_type_base
uint32_t grid_size_x;
uint32_t grid_size_y;
uint32_t grid_size_z;
// Stream handle
size_t stream_handle;
size_t stream_handle;
};
struct memory_copy_sample : storage_parsed_type_base
template <>
inline void
serialize(uint8_t* buffer, const kernel_dispatch_sample& item)
{
// Timing fields
utility::store_value(
buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.kernel_id, item.dispatch_id, item.queue_id_handle,
item.correlation_id_internal, item.correlation_id_ancestor,
item.private_segment_size, item.group_segment_size, item.workgroup_size_x,
item.workgroup_size_y, item.workgroup_size_z, item.grid_size_x, item.grid_size_y,
item.grid_size_z, (uint64_t) item.stream_handle);
}
template <>
inline kernel_dispatch_sample
deserialize(uint8_t*& buffer)
{
kernel_dispatch_sample item;
uint64_t stream_handle;
utility::parse_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.kernel_id, item.dispatch_id,
item.queue_id_handle, item.correlation_id_internal,
item.correlation_id_ancestor, item.private_segment_size,
item.group_segment_size, item.workgroup_size_x,
item.workgroup_size_y, item.workgroup_size_z, item.grid_size_x,
item.grid_size_y, item.grid_size_z, stream_handle);
item.stream_handle = stream_handle;
return item;
}
template <>
inline size_t
get_size(const kernel_dispatch_sample& item)
{
return utility::get_size(
item.start_timestamp, item.end_timestamp, item.thread_id, item.agent_id_handle,
item.kernel_id, item.dispatch_id, item.queue_id_handle,
item.correlation_id_internal, item.correlation_id_ancestor,
item.private_segment_size, item.group_segment_size, item.workgroup_size_x,
item.workgroup_size_y, item.workgroup_size_z, item.grid_size_x, item.grid_size_y,
item.grid_size_z, (uint64_t) item.stream_handle);
}
struct memory_copy_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier = type_identifier_t::memory_copy;
memory_copy_sample() = default;
memory_copy_sample(uint64_t _start_timestamp, uint64_t _end_timestamp,
uint64_t _thread_id, uint64_t _dst_agent_id_handle,
uint64_t _src_agent_id_handle, int32_t _kind, int32_t _operation,
uint64_t _bytes, uint64_t _correlation_id_internal,
uint64_t _correlation_id_ancestor, uint64_t _dst_address_value,
uint64_t _src_address_value, size_t _stream_handle)
: start_timestamp(_start_timestamp)
, end_timestamp(_end_timestamp)
, thread_id(_thread_id)
, dst_agent_id_handle(_dst_agent_id_handle)
, src_agent_id_handle(_src_agent_id_handle)
, kind(_kind)
, operation(_operation)
, bytes(_bytes)
, correlation_id_internal(_correlation_id_internal)
, correlation_id_ancestor(_correlation_id_ancestor)
, dst_address_value(_dst_address_value)
, src_address_value(_src_address_value)
, stream_handle(_stream_handle)
{}
uint64_t start_timestamp;
uint64_t end_timestamp;
// Identification fields
uint64_t thread_id;
uint64_t dst_agent_id_handle;
uint64_t src_agent_id_handle;
// Operation details
int32_t kind;
int32_t operation;
uint64_t bytes;
// Correlation fields
uint64_t correlation_id_internal;
uint64_t correlation_id_ancestor;
// Address fields (version dependent)
uint64_t dst_address_value;
uint64_t src_address_value;
// Stream handle
size_t stream_handle;
size_t stream_handle;
};
#if(ROCPROFILER_VERSION >= 600)
struct memory_allocate_sample : storage_parsed_type_base
template <>
inline void
serialize(uint8_t* buffer, const memory_copy_sample& item)
{
// Timing fields
utility::store_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.dst_agent_id_handle, item.src_agent_id_handle, item.kind,
item.operation, item.bytes, item.correlation_id_internal,
item.correlation_id_ancestor, item.dst_address_value,
item.src_address_value, (uint64_t) item.stream_handle);
}
template <>
inline memory_copy_sample
deserialize(uint8_t*& buffer)
{
memory_copy_sample item;
uint64_t stream_handle;
utility::parse_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.dst_agent_id_handle, item.src_agent_id_handle, item.kind,
item.operation, item.bytes, item.correlation_id_internal,
item.correlation_id_ancestor, item.dst_address_value,
item.src_address_value, stream_handle);
item.stream_handle = stream_handle;
return item;
}
template <>
inline size_t
get_size(const memory_copy_sample& item)
{
return utility::get_size(
item.start_timestamp, item.end_timestamp, item.thread_id,
item.dst_agent_id_handle, item.src_agent_id_handle, item.kind, item.operation,
item.bytes, item.correlation_id_internal, item.correlation_id_ancestor,
item.dst_address_value, item.src_address_value, (uint64_t) item.stream_handle);
}
struct memory_allocate_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier = type_identifier_t::memory_alloc;
memory_allocate_sample() = default;
memory_allocate_sample(uint64_t _start_timestamp, uint64_t _end_timestamp,
uint64_t _thread_id, uint64_t _agent_id_handle, int32_t _kind,
int32_t _operation, uint64_t _allocation_size,
uint64_t _correlation_id_internal,
uint64_t _correlation_id_ancestor, uint64_t _address_value,
size_t _stream_handle)
: start_timestamp(_start_timestamp)
, end_timestamp(_end_timestamp)
, thread_id(_thread_id)
, agent_id_handle(_agent_id_handle)
, kind(_kind)
, operation(_operation)
, allocation_size(_allocation_size)
, correlation_id_internal(_correlation_id_internal)
, correlation_id_ancestor(_correlation_id_ancestor)
, address_value(_address_value)
, stream_handle(_stream_handle)
{}
uint64_t start_timestamp;
uint64_t end_timestamp;
// Identification fields
uint64_t thread_id;
uint64_t agent_id_handle;
// Operation details
int32_t kind;
int32_t operation;
uint64_t allocation_size;
// Correlation fields
uint64_t correlation_id_internal;
uint64_t correlation_id_ancestor;
// Address fields (version dependent)
uint64_t address_value;
// Stream handle
size_t stream_handle;
size_t stream_handle;
};
#endif
struct region_sample : storage_parsed_type_base
template <>
inline void
serialize(uint8_t* buffer, const memory_allocate_sample& item)
{
utility::store_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.kind, item.operation,
item.allocation_size, item.correlation_id_internal,
item.correlation_id_ancestor, item.address_value,
(uint64_t) item.stream_handle);
}
template <>
inline memory_allocate_sample
deserialize(uint8_t*& buffer)
{
memory_allocate_sample item;
uint64_t stream_handle;
utility::parse_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.kind, item.operation,
item.allocation_size, item.correlation_id_internal,
item.correlation_id_ancestor, item.address_value, stream_handle);
item.stream_handle = stream_handle;
return item;
}
template <>
inline size_t
get_size(const memory_allocate_sample& item)
{
return utility::get_size(
item.start_timestamp, item.end_timestamp, item.thread_id, item.agent_id_handle,
item.kind, item.operation, item.allocation_size, item.correlation_id_internal,
item.correlation_id_ancestor, item.address_value, (uint64_t) item.stream_handle);
}
struct region_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier = type_identifier_t::region;
region_sample() = default;
region_sample(uint64_t _thread_id, std::string _name,
uint64_t _correlation_id_internal, uint64_t _correlation_id_ancestor,
@@ -147,20 +319,74 @@ struct region_sample : storage_parsed_type_base
uint64_t thread_id;
std::string name;
uint64_t correlation_id_internal;
uint64_t correlation_id_ancestor;
uint64_t start_timestamp;
uint64_t end_timestamp;
uint64_t correlation_id_internal;
uint64_t correlation_id_ancestor;
uint64_t start_timestamp;
uint64_t end_timestamp;
std::string call_stack;
std::string args_str;
std::string category;
};
struct in_time_sample : storage_parsed_type_base
template <>
inline void
serialize(uint8_t* buffer, const region_sample& item)
{
utility::store_value(
buffer, item.thread_id, std::string_view(item.name), item.correlation_id_internal,
item.correlation_id_ancestor, item.start_timestamp, item.end_timestamp,
std::string_view(item.call_stack), std::string_view(item.args_str),
std::string_view(item.category));
}
template <>
inline region_sample
deserialize(uint8_t*& buffer)
{
region_sample item;
std::string_view name_view, call_stack_view, args_str_view, category_view;
utility::parse_value(buffer, item.thread_id, name_view, item.correlation_id_internal,
item.correlation_id_ancestor, item.start_timestamp,
item.end_timestamp, call_stack_view, args_str_view,
category_view);
item.name = std::string(name_view);
item.call_stack = std::string(call_stack_view);
item.args_str = std::string(args_str_view);
item.category = std::string(category_view);
return item;
}
template <>
inline size_t
get_size(const region_sample& item)
{
return utility::get_size(
item.thread_id, std::string_view(item.name), item.correlation_id_internal,
item.correlation_id_ancestor, item.start_timestamp, item.end_timestamp,
std::string_view(item.call_stack), std::string_view(item.args_str),
std::string_view(item.category));
}
struct in_time_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier =
type_identifier_t::in_time_sample;
in_time_sample() = default;
in_time_sample(std::string _track_name, size_t _timestamp_ns,
std::string _event_metadata, size_t _stack_id, size_t _parent_stack_id,
size_t _correlation_id, std::string _call_stack,
std::string _line_info)
: track_name(std::move(_track_name))
, timestamp_ns(_timestamp_ns)
, event_metadata(std::move(_event_metadata))
, stack_id(_stack_id)
, parent_stack_id(_parent_stack_id)
, correlation_id(_correlation_id)
, call_stack(std::move(_call_stack))
, line_info(std::move(_line_info))
{}
std::string track_name;
size_t timestamp_ns;
std::string event_metadata;
@@ -171,16 +397,149 @@ struct in_time_sample : storage_parsed_type_base
std::string line_info;
};
template <>
inline void
serialize(uint8_t* buffer, const in_time_sample& item)
{
utility::store_value(
buffer, std::string_view(item.track_name), (uint64_t) item.timestamp_ns,
std::string_view(item.event_metadata), (uint64_t) item.stack_id,
(uint64_t) item.parent_stack_id, (uint64_t) item.correlation_id,
std::string_view(item.call_stack), std::string_view(item.line_info));
}
template <>
inline in_time_sample
deserialize(uint8_t*& buffer)
{
in_time_sample item;
std::string_view track_name_view, event_metadata_view, call_stack_view,
line_info_view;
uint64_t timestamp_ns, stack_id, parent_stack_id, correlation_id;
utility::parse_value(buffer, track_name_view, timestamp_ns, event_metadata_view,
stack_id, parent_stack_id, correlation_id, call_stack_view,
line_info_view);
item.track_name = std::string(track_name_view);
item.timestamp_ns = timestamp_ns;
item.event_metadata = std::string(event_metadata_view);
item.stack_id = stack_id;
item.parent_stack_id = parent_stack_id;
item.correlation_id = correlation_id;
item.call_stack = std::string(call_stack_view);
item.line_info = std::string(line_info_view);
return item;
}
template <>
inline size_t
get_size(const in_time_sample& item)
{
return utility::get_size(
std::string_view(item.track_name), (uint64_t) item.timestamp_ns,
std::string_view(item.event_metadata), (uint64_t) item.stack_id,
(uint64_t) item.parent_stack_id, (uint64_t) item.correlation_id,
std::string_view(item.call_stack), std::string_view(item.line_info));
}
struct pmc_event_with_sample : in_time_sample
{
static constexpr type_identifier_t type_identifier =
type_identifier_t::pmc_event_with_sample;
pmc_event_with_sample() = default;
pmc_event_with_sample(std::string _track_name, size_t _timestamp_ns,
std::string _event_metadata, size_t _stack_id,
size_t _parent_stack_id, size_t _correlation_id,
std::string _call_stack, std::string _line_info,
uint32_t _device_id, uint8_t _device_type,
std::string _pmc_info_name, double _value)
: in_time_sample(std::move(_track_name), _timestamp_ns, std::move(_event_metadata),
_stack_id, _parent_stack_id, _correlation_id, std::move(_call_stack),
std::move(_line_info))
, device_id(_device_id)
, device_type(_device_type)
, pmc_info_name(std::move(_pmc_info_name))
, value(_value)
{}
uint32_t device_id;
uint8_t device_type;
std::string pmc_info_name;
double value;
};
struct amd_smi_sample : storage_parsed_type_base
template <>
inline void
serialize(uint8_t* buffer, const pmc_event_with_sample& item)
{
utility::store_value(
buffer, std::string_view(item.track_name), (uint64_t) item.timestamp_ns,
std::string_view(item.event_metadata), (uint64_t) item.stack_id,
(uint64_t) item.parent_stack_id, (uint64_t) item.correlation_id,
std::string_view(item.call_stack), std::string_view(item.line_info),
item.device_id, item.device_type, std::string_view(item.pmc_info_name),
item.value);
}
template <>
inline pmc_event_with_sample
deserialize(uint8_t*& buffer)
{
pmc_event_with_sample item;
std::string_view track_name_view, event_metadata_view, call_stack_view,
line_info_view, pmc_info_name_view;
uint64_t timestamp_ns, stack_id, parent_stack_id, correlation_id;
utility::parse_value(buffer, track_name_view, timestamp_ns, event_metadata_view,
stack_id, parent_stack_id, correlation_id, call_stack_view,
line_info_view, item.device_id, item.device_type,
pmc_info_name_view, item.value);
item.track_name = std::string(track_name_view);
item.timestamp_ns = timestamp_ns;
item.event_metadata = std::string(event_metadata_view);
item.stack_id = stack_id;
item.parent_stack_id = parent_stack_id;
item.correlation_id = correlation_id;
item.call_stack = std::string(call_stack_view);
item.line_info = std::string(line_info_view);
item.pmc_info_name = std::string(pmc_info_name_view);
return item;
}
template <>
inline size_t
get_size(const pmc_event_with_sample& item)
{
return utility::get_size(
std::string_view(item.track_name), (uint64_t) item.timestamp_ns,
std::string_view(item.event_metadata), (uint64_t) item.stack_id,
(uint64_t) item.parent_stack_id, (uint64_t) item.correlation_id,
std::string_view(item.call_stack), std::string_view(item.line_info),
item.device_id, item.device_type, std::string_view(item.pmc_info_name),
item.value);
}
struct amd_smi_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier =
type_identifier_t::amd_smi_sample;
amd_smi_sample() = default;
amd_smi_sample(uint64_t _settings, uint32_t _device_id, size_t _timestamp,
uint32_t _gfx_activity, uint32_t _umc_activity, uint32_t _mm_activity,
uint32_t _power, int64_t _temperature, size_t _mem_usage,
std::vector<uint8_t> _gpu_activity)
: settings(_settings)
, device_id(_device_id)
, timestamp(_timestamp)
, gfx_activity(_gfx_activity)
, umc_activity(_umc_activity)
, mm_activity(_mm_activity)
, power(_power)
, temperature(_temperature)
, mem_usage(_mem_usage)
, gpu_activity(std::move(_gpu_activity))
{}
enum class settings_positions : uint8_t
{
busy = 0,
@@ -205,8 +564,61 @@ struct amd_smi_sample : storage_parsed_type_base
std::vector<uint8_t> gpu_activity;
};
struct cpu_freq_sample : storage_parsed_type_base
template <>
inline void
serialize(uint8_t* buffer, const amd_smi_sample& item)
{
utility::store_value(buffer, item.settings, item.device_id, (uint64_t) item.timestamp,
item.gfx_activity, item.umc_activity, item.mm_activity,
item.power, item.temperature, (uint64_t) item.mem_usage,
item.gpu_activity);
}
template <>
inline amd_smi_sample
deserialize(uint8_t*& buffer)
{
amd_smi_sample item;
uint64_t timestamp, mem_usage;
utility::parse_value(buffer, item.settings, item.device_id, timestamp,
item.gfx_activity, item.umc_activity, item.mm_activity,
item.power, item.temperature, mem_usage, item.gpu_activity);
item.timestamp = timestamp;
item.mem_usage = mem_usage;
return item;
}
template <>
inline size_t
get_size(const amd_smi_sample& item)
{
return utility::get_size(item.settings, item.device_id, (uint64_t) item.timestamp,
item.gfx_activity, item.umc_activity, item.mm_activity,
item.power, item.temperature, (uint64_t) item.mem_usage,
item.gpu_activity);
}
struct cpu_freq_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier =
type_identifier_t::cpu_freq_sample;
cpu_freq_sample() = default;
cpu_freq_sample(size_t _timestamp, int64_t _page_rss, int64_t _virt_mem_usage,
int64_t _peak_rss, int64_t _context_switch_count,
int64_t _page_faults, int64_t _user_mode_time,
int64_t _kernel_mode_time, std::vector<uint8_t> _freqs)
: timestamp(_timestamp)
, page_rss(_page_rss)
, virt_mem_usage(_virt_mem_usage)
, peak_rss(_peak_rss)
, context_switch_count(_context_switch_count)
, page_faults(_page_faults)
, user_mode_time(_user_mode_time)
, kernel_mode_time(_kernel_mode_time)
, freqs(std::move(_freqs))
{}
size_t timestamp;
int64_t page_rss;
int64_t virt_mem_usage;
@@ -218,8 +630,44 @@ struct cpu_freq_sample : storage_parsed_type_base
std::vector<uint8_t> freqs;
};
struct backtrace_region_sample : storage_parsed_type_base
template <>
inline void
serialize(uint8_t* buffer, const cpu_freq_sample& item)
{
utility::store_value(buffer, (uint64_t) item.timestamp, item.page_rss,
item.virt_mem_usage, item.peak_rss, item.context_switch_count,
item.page_faults, item.user_mode_time, item.kernel_mode_time,
item.freqs);
}
template <>
inline cpu_freq_sample
deserialize(uint8_t*& buffer)
{
cpu_freq_sample item;
uint64_t timestamp;
utility::parse_value(buffer, timestamp, item.page_rss, item.virt_mem_usage,
item.peak_rss, item.context_switch_count, item.page_faults,
item.user_mode_time, item.kernel_mode_time, item.freqs);
item.timestamp = timestamp;
return item;
}
template <>
inline size_t
get_size(const cpu_freq_sample& item)
{
return utility::get_size((uint64_t) item.timestamp, item.page_rss,
item.virt_mem_usage, item.peak_rss,
item.context_switch_count, item.page_faults,
item.user_mode_time, item.kernel_mode_time, item.freqs);
}
struct backtrace_region_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier =
type_identifier_t::backtrace_region_sample;
backtrace_region_sample() = default;
backtrace_region_sample(uint32_t _type, uint64_t _thread_id, std::string _track_name,
std::string _name, uint64_t _start_timestamp,
@@ -242,30 +690,54 @@ struct backtrace_region_sample : storage_parsed_type_base
uint64_t thread_id;
std::string track_name;
std::string name;
uint64_t start_timestamp;
uint64_t end_timestamp;
uint64_t start_timestamp;
uint64_t end_timestamp;
std::string category;
std::string call_stack;
std::string line_info;
std::string extdata;
};
enum class entry_type : uint32_t
template <>
inline void
serialize(uint8_t* buffer, const backtrace_region_sample& item)
{
in_time_sample = 0x0000,
pmc_event_with_sample = 0x0001,
region = 0x0002,
kernel_dispatch = 0x0003,
memory_copy = 0x0004,
#if(ROCPROFSYS_USE_ROCM && ROCPROFILER_VERSION >= 600)
memory_alloc = 0x0005,
#endif
amd_smi_sample = 0x0006,
cpu_freq_sample = 0x0007,
backtrace_region_sample = 0x0008,
fragmented_space = 0xFFFF
};
utility::store_value(
buffer, item.type, item.thread_id, std::string_view(item.track_name),
std::string_view(item.name), item.start_timestamp, item.end_timestamp,
std::string_view(item.category), std::string_view(item.call_stack),
std::string_view(item.line_info), std::string_view(item.extdata));
}
template <>
inline backtrace_region_sample
deserialize(uint8_t*& buffer)
{
backtrace_region_sample item;
std::string_view track_name_view, name_view, category_view, call_stack_view,
line_info_view, extdata_view;
utility::parse_value(buffer, item.type, item.thread_id, track_name_view, name_view,
item.start_timestamp, item.end_timestamp, category_view,
call_stack_view, line_info_view, extdata_view);
item.track_name = std::string(track_name_view);
item.name = std::string(name_view);
item.category = std::string(category_view);
item.call_stack = std::string(call_stack_view);
item.line_info = std::string(line_info_view);
item.extdata = std::string(extdata_view);
return item;
}
template <>
inline size_t
get_size(const backtrace_region_sample& item)
{
return utility::get_size(
item.type, item.thread_id, std::string_view(item.track_name),
std::string_view(item.name), item.start_timestamp, item.end_timestamp,
std::string_view(item.category), std::string_view(item.call_stack),
std::string_view(item.line_info), std::string_view(item.extdata));
}
} // namespace trace_cache
} // namespace rocprofsys
@@ -1,274 +0,0 @@
// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "storage_parser.hpp"
#include "debug.hpp"
#include "trace_cache/sample_type.hpp"
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <functional>
#include <memory>
#include <sstream>
#include <string>
namespace rocprofsys
{
namespace trace_cache
{
storage_parser::storage_parser(std::string _filename)
: m_filename(std::move(_filename))
{}
void
storage_parser::register_type_callback(
const entry_type& type,
const std::function<void(const storage_parsed_type_base&)>& callback)
{
m_callbacks[type].push_back(callback);
}
void
storage_parser::register_on_finished_callback(
std::unique_ptr<std::function<void()>> callback)
{
m_on_finished_callback = std::move(callback);
}
void
storage_parser::consume_storage()
{
ROCPROFSYS_VERBOSE(0, "Consuming buffered storage with filename: %s\n",
m_filename.c_str());
std::ifstream ifs(m_filename, std::ios::binary);
if(!ifs)
{
std::stringstream ss;
ss << "Error opening file for reading: " << m_filename << "\n";
throw std::runtime_error(ss.str());
}
bool _parsing_needed = !m_callbacks.empty();
struct __attribute__((packed)) sample_header
{
entry_type type;
size_t sample_size;
};
sample_header header;
while(!ifs.eof() && _parsing_needed)
{
ifs.read(reinterpret_cast<char*>(&header), sizeof(header));
if(header.sample_size == 0 || ifs.eof())
{
continue;
}
std::vector<uint8_t> sample;
sample.reserve(header.sample_size);
ifs.read(reinterpret_cast<char*>(sample.data()), header.sample_size);
if(ifs.bad())
{
ROCPROFSYS_WARNING(1,
"Bad read while consuming buffered storage. Filename: %s. "
"Bytes read: %d\n",
m_filename.c_str(), static_cast<int>(ifs.tellg()));
continue;
}
switch(header.type)
{
case entry_type::kernel_dispatch:
{
kernel_dispatch_sample _kernel_dispatch_sample;
parse_data(sample.data(), _kernel_dispatch_sample.start_timestamp,
_kernel_dispatch_sample.end_timestamp,
_kernel_dispatch_sample.thread_id,
_kernel_dispatch_sample.agent_id_handle,
_kernel_dispatch_sample.kernel_id,
_kernel_dispatch_sample.dispatch_id,
_kernel_dispatch_sample.queue_id_handle,
_kernel_dispatch_sample.correlation_id_internal,
_kernel_dispatch_sample.correlation_id_ancestor,
_kernel_dispatch_sample.private_segment_size,
_kernel_dispatch_sample.group_segment_size,
_kernel_dispatch_sample.workgroup_size_x,
_kernel_dispatch_sample.workgroup_size_y,
_kernel_dispatch_sample.workgroup_size_z,
_kernel_dispatch_sample.grid_size_x,
_kernel_dispatch_sample.grid_size_y,
_kernel_dispatch_sample.grid_size_z,
_kernel_dispatch_sample.stream_handle);
invoke_callbacks(header.type, _kernel_dispatch_sample);
break;
}
case entry_type::memory_copy:
{
memory_copy_sample _memory_copy_sample;
parse_data(
sample.data(), _memory_copy_sample.start_timestamp,
_memory_copy_sample.end_timestamp, _memory_copy_sample.thread_id,
_memory_copy_sample.dst_agent_id_handle,
_memory_copy_sample.src_agent_id_handle, _memory_copy_sample.kind,
_memory_copy_sample.operation, _memory_copy_sample.bytes,
_memory_copy_sample.correlation_id_internal,
_memory_copy_sample.correlation_id_ancestor,
_memory_copy_sample.dst_address_value,
_memory_copy_sample.src_address_value,
_memory_copy_sample.stream_handle);
invoke_callbacks(header.type, _memory_copy_sample);
break;
}
#if(ROCPROFILER_VERSION >= 600)
case entry_type::memory_alloc:
{
memory_allocate_sample _memory_allocate_sample;
parse_data(sample.data(), _memory_allocate_sample.start_timestamp,
_memory_allocate_sample.end_timestamp,
_memory_allocate_sample.thread_id,
_memory_allocate_sample.agent_id_handle,
_memory_allocate_sample.kind,
_memory_allocate_sample.operation,
_memory_allocate_sample.allocation_size,
_memory_allocate_sample.correlation_id_internal,
_memory_allocate_sample.correlation_id_ancestor,
_memory_allocate_sample.address_value,
_memory_allocate_sample.stream_handle);
invoke_callbacks(header.type, _memory_allocate_sample);
break;
}
#endif
case entry_type::region:
{
region_sample _region_sample;
parse_data(sample.data(), _region_sample.thread_id, _region_sample.name,
_region_sample.correlation_id_internal,
_region_sample.correlation_id_ancestor,
_region_sample.start_timestamp, _region_sample.end_timestamp,
_region_sample.call_stack, _region_sample.args_str,
_region_sample.category);
invoke_callbacks(header.type, _region_sample);
break;
}
case entry_type::in_time_sample:
{
in_time_sample _in_time_sample;
parse_data(sample.data(), _in_time_sample.track_name,
_in_time_sample.timestamp_ns, _in_time_sample.event_metadata,
_in_time_sample.stack_id, _in_time_sample.parent_stack_id,
_in_time_sample.correlation_id, _in_time_sample.call_stack,
_in_time_sample.line_info);
invoke_callbacks(header.type, _in_time_sample);
break;
}
case entry_type::pmc_event_with_sample:
{
pmc_event_with_sample _pmc_event_with_sample;
parse_data(
sample.data(), _pmc_event_with_sample.track_name,
_pmc_event_with_sample.timestamp_ns,
_pmc_event_with_sample.event_metadata,
_pmc_event_with_sample.stack_id,
_pmc_event_with_sample.parent_stack_id,
_pmc_event_with_sample.correlation_id,
_pmc_event_with_sample.call_stack, _pmc_event_with_sample.line_info,
_pmc_event_with_sample.device_id, _pmc_event_with_sample.device_type,
_pmc_event_with_sample.pmc_info_name, _pmc_event_with_sample.value);
invoke_callbacks(header.type, _pmc_event_with_sample);
break;
}
case entry_type::amd_smi_sample:
{
amd_smi_sample _amd_smi_sample;
parse_data(sample.data(), _amd_smi_sample.settings,
_amd_smi_sample.device_id, _amd_smi_sample.timestamp,
_amd_smi_sample.gfx_activity, _amd_smi_sample.umc_activity,
_amd_smi_sample.mm_activity, _amd_smi_sample.power,
_amd_smi_sample.temperature, _amd_smi_sample.mem_usage,
_amd_smi_sample.gpu_activity);
invoke_callbacks(header.type, _amd_smi_sample);
break;
}
case entry_type::cpu_freq_sample:
{
cpu_freq_sample _cpu_freq_sample;
parse_data(sample.data(), _cpu_freq_sample.timestamp,
_cpu_freq_sample.page_rss, _cpu_freq_sample.virt_mem_usage,
_cpu_freq_sample.peak_rss,
_cpu_freq_sample.context_switch_count,
_cpu_freq_sample.page_faults, _cpu_freq_sample.user_mode_time,
_cpu_freq_sample.kernel_mode_time, _cpu_freq_sample.freqs);
invoke_callbacks(header.type, _cpu_freq_sample);
break;
}
case entry_type::backtrace_region_sample:
{
backtrace_region_sample _backtrace_region_sample;
parse_data(
sample.data(), _backtrace_region_sample.type,
_backtrace_region_sample.thread_id,
_backtrace_region_sample.track_name, _backtrace_region_sample.name,
_backtrace_region_sample.start_timestamp,
_backtrace_region_sample.end_timestamp,
_backtrace_region_sample.category,
_backtrace_region_sample.call_stack,
_backtrace_region_sample.line_info, _backtrace_region_sample.extdata);
invoke_callbacks(header.type, _backtrace_region_sample);
}
default: break;
}
}
ifs.close();
if(m_on_finished_callback != nullptr)
{
(*m_on_finished_callback)();
}
}
void
storage_parser::invoke_callbacks(entry_type type, const storage_parsed_type_base& parsed)
{
auto _callback_list = m_callbacks.find(type);
if(_callback_list == m_callbacks.end())
{
ROCPROFSYS_VERBOSE(1, "Callback not found for cache postprocessing\n");
return;
}
for(auto& cb : _callback_list->second)
{
cb(parsed);
}
}
} // namespace trace_cache
} // namespace rocprofsys
@@ -22,73 +22,142 @@
#pragma once
#include "buffer_storage.hpp"
#include "sample_type.hpp"
#include "common/defines.h"
#include "core/debug.hpp"
#include "core/trace_cache/cacheable.hpp"
#include "core/trace_cache/type_registry.hpp"
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <iterator>
#include <map>
#include <rocprofiler-systems/categories.h>
#include <stdint.h>
#include <memory>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
namespace rocprofsys
{
namespace trace_cache
{
using postprocessing_callback = std::function<void(const storage_parsed_type_base&)>;
class cache_manager;
template <typename TypeIdentifierEnum, typename... SupportedTypes>
class storage_parser
{
public:
void register_type_callback(const entry_type& type,
const postprocessing_callback& callback);
static_assert(type_traits::is_enum_class_v<TypeIdentifierEnum>,
"TypeIdentifierEnum must be an enum class");
void consume_storage();
void register_on_finished_callback(std::unique_ptr<std::function<void()>> callback);
static_assert(sizeof...(SupportedTypes) != 0, "SupportedTypes must be non-empty");
public:
storage_parser(std::string _filename)
: m_filename(std::move(_filename))
{}
template <typename TypeProcessing>
void load(std::shared_ptr<TypeProcessing> _type_processing)
{
static_assert(
type_traits::has_execute_processing<TypeProcessing, TypeIdentifierEnum,
cacheable_t>::value,
"TypeProcessing must have member function "
"execute_sample_processing(TypeIdentifierEnum, const cacheable_t&)");
if(_type_processing == nullptr)
{
throw std::runtime_error("TypeProcessing is nullptr");
}
ROCPROFSYS_DEBUG("Consuming buffered storage with filename: %s\n",
m_filename.c_str());
std::ifstream ifs(m_filename, std::ios::binary);
if(!ifs.good())
{
std::stringstream ss;
ss << "Error opening file for reading: " << m_filename << "\n";
throw std::runtime_error(ss.str());
}
struct __attribute__((packed)) sample_header
{
TypeIdentifierEnum type;
size_t sample_size;
};
sample_header header;
std::vector<uint8_t> sample;
sample.reserve(4096);
size_t last_capacity = sample.capacity();
while(!ifs.eof())
{
if(!ifs.good())
{
ROCPROFSYS_WARNING(0,
"Stream not in good state, stopping parse. File: %s\n",
m_filename.c_str());
break;
}
ifs.read(reinterpret_cast<char*>(&header), sizeof(header));
if(header.sample_size == 0 || ifs.eof())
{
continue;
}
if(ROCPROFSYS_UNLIKELY(header.sample_size > last_capacity))
{
sample.reserve(header.sample_size);
last_capacity = sample.capacity();
}
sample.resize(header.sample_size);
ifs.read(reinterpret_cast<char*>(sample.data()), header.sample_size);
if(ifs.fail())
{
ROCPROFSYS_WARNING(1,
"Bad read while consuming buffered storage. Filename: "
"%s Bytes read: %d\n",
m_filename.c_str(), static_cast<int>(ifs.tellg()));
continue;
}
if(header.type == TypeIdentifierEnum::fragmented_space)
{
continue;
}
auto* data = sample.data();
auto sample_value = m_registry.get_type(header.type, data);
if(sample_value.has_value())
{
_type_processing->execute_sample_processing(
header.type, std::visit(
[](auto& arg) -> cacheable_t& {
return static_cast<cacheable_t&>(arg);
},
sample_value.value()));
}
else
{
ROCPROFSYS_DEBUG("Unsupported type detected. Skipping current sample.\n");
continue;
}
}
ifs.close();
ROCPROFSYS_DEBUG("File parsing finished. Removing %s from file system.\n",
m_filename.c_str());
std::remove(m_filename.c_str());
}
private:
friend class cache_manager;
storage_parser(std::string _filename);
template <typename T>
static void process_arg(const uint8_t*& data_pos, T& arg)
{
if constexpr(std::is_same_v<T, std::string>)
{
arg = std::string((const char*) data_pos);
data_pos += arg.size() + 1;
}
else if constexpr(std::is_same_v<T, std::vector<uint8_t>>)
{
size_t vector_size = *reinterpret_cast<const size_t*>(data_pos);
data_pos += sizeof(size_t);
arg.reserve(vector_size);
std::copy_n(data_pos, vector_size, std::back_inserter(arg));
data_pos += vector_size;
}
else
{
arg = *reinterpret_cast<const T*>(data_pos);
data_pos += sizeof(T);
}
}
template <typename... Args>
static void parse_data(const uint8_t* data_pos, Args&... args)
{
(process_arg(data_pos, args), ...);
}
void invoke_callbacks(entry_type type, const storage_parsed_type_base& parsed);
std::string m_filename;
std::map<entry_type, std::vector<postprocessing_callback>> m_callbacks;
std::unique_ptr<std::function<void()>> m_on_finished_callback{ nullptr };
std::string m_filename;
type_registry<TypeIdentifierEnum, SupportedTypes...> m_registry;
};
} // namespace trace_cache
@@ -21,37 +21,53 @@
// SOFTWARE.
#pragma once
#include "library/runtime.hpp"
#include "sample_type.hpp"
#include <array>
#include <string>
#include <timemory/units.hpp>
#include <unistd.h>
#include "core/trace_cache/cache_type_traits.hpp"
#include <functional>
#include <map>
#include <optional>
namespace rocprofsys
{
namespace trace_cache
{
constexpr size_t buffer_size = 100 * tim::units::megabyte;
constexpr size_t flush_threshold = 80 * tim::units::megabyte;
const auto tmp_directory = std::string{ "/tmp/" };
template <typename TypeIdentifierEnum, typename... SupportedTypes>
class type_registry
{
static_assert(type_traits::is_enum_class_v<TypeIdentifierEnum>,
"TypeIdentifierEnum must be an enum class");
const auto get_buffered_storage_filename = [](const int& ppid, const int& pid) {
return std::string{ tmp_directory + "buffered_storage_" + std::to_string(ppid) + "_" +
std::to_string(pid) + ".bin" };
public:
using variant_t = typename std::variant<SupportedTypes...>;
type_registry() { (register_type<SupportedTypes>(), ...); }
std::optional<variant_t> get_type(TypeIdentifierEnum id, uint8_t*& data)
{
auto it = deserializers.find(id);
if(it != deserializers.end())
{
return it->second(data);
}
return std::nullopt;
}
private:
std::map<TypeIdentifierEnum, std::function<variant_t(uint8_t*&)>> deserializers;
template <typename T>
inline void register_type()
{
static_assert(type_traits::has_type_identifier<T, TypeIdentifierEnum>::value,
"Type must have type_identifier");
static_assert(type_traits::has_deserialize<T>::value,
"Type must have deserialize function");
deserializers[T::type_identifier] = [](uint8_t*& data) -> variant_t {
return deserialize<T>(data);
};
}
};
const auto get_metadata_filepath = [](const int& ppid, const int& pid) {
return std::string{ tmp_directory + "metadata_" + std::to_string(ppid) + "_" +
std::to_string(pid) + ".json" };
};
constexpr size_t minimal_fragmented_memory_size = sizeof(entry_type) + sizeof(size_t);
using buffer_array_t = std::array<uint8_t, buffer_size>;
constexpr auto ABSOLUTE = "ABS";
constexpr auto PERCENTAGE = "%";
} // namespace trace_cache
} // namespace rocprofsys
@@ -45,7 +45,7 @@
#include "core/rocpd/data_processor.hpp"
#include "core/timemory.hpp"
#include "core/trace_cache/cache_manager.hpp"
#include "core/trace_cache/cache_utility.hpp"
#include "core/trace_cache/cacheable.hpp"
#include "core/trace_cache/metadata_registry.hpp"
#include "core/utility.hpp"
#include "library/causal/data.hpp"
@@ -578,7 +578,7 @@ rocprofsys_init_tooling_hidden(void)
ROCPROFSYS_DEBUG_F("State: %s -> State::Active\n",
std::to_string(get_state()).c_str());
trace_cache::get_buffer_storage().start_flushing_thread(getpid());
trace_cache::get_buffer_storage().start(getpid());
set_state(State::Active); // set to active as very last operation
} };
@@ -798,7 +798,7 @@ rocprofsys_finalize_hidden(void)
const auto _agents = get_agent_manager_instance().get_agents();
_manager.shutdown();
const auto metadata_filepath =
trace_cache::get_metadata_filepath(get_root_process_id(), getpid());
trace_cache::utility::get_metadata_filepath(get_root_process_id(), getpid());
_manager.get_metadata_registry().save_to_file(metadata_filepath, _agents);
std::quick_exit(EXIT_SUCCESS);
@@ -28,7 +28,7 @@
#include "core/agent.hpp"
#include "core/trace_cache/cache_manager.hpp"
#include "core/trace_cache/cache_utility.hpp"
#include "core/trace_cache/cacheable.hpp"
#include "core/trace_cache/sample_type.hpp"
#include <amd_smi/amdsmi.h>
#include <cstdint>
@@ -693,12 +693,11 @@ data::sample(uint32_t _device_id)
// Store samples if basic metrics are enabled OR if there's advanced metric data
if(_basic_metrics_enabled || has_data)
{
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::amd_smi_sample, serialize_settings(m_dev_id),
_device_id, _timestamp, m_busy_perc.gfx_activity,
m_busy_perc.umc_activity, m_busy_perc.mm_activity,
m_power.current_socket_power, m_temp, m_mem_usage,
serialize_gpu_metrics(m_dev_id, metrics, capabilities));
trace_cache::get_buffer_storage().store(trace_cache::amd_smi_sample{
serialize_settings(m_dev_id), _device_id, _timestamp,
m_busy_perc.gfx_activity, m_busy_perc.umc_activity,
m_busy_perc.mm_activity, m_power.current_socket_power, m_temp,
m_mem_usage, serialize_gpu_metrics(m_dev_id, metrics, capabilities) });
if(has_data) m_gpu_metrics.push_back(metrics);
}
@@ -27,7 +27,7 @@
#include "core/debug.hpp"
#include "core/perfetto.hpp"
#include "core/trace_cache/cache_manager.hpp"
#include "core/trace_cache/cache_utility.hpp"
#include "core/trace_cache/cacheable.hpp"
#include "core/trace_cache/metadata_registry.hpp"
#include "library/components/ensure_storage.hpp"
#include "library/ptl.hpp"
@@ -261,11 +261,10 @@ cache_backtrace_metrics_events(const uint32_t device_id, uint64_t timestamp_ns,
const auto* line_info = "";
auto insert_event_and_sample = [&](const char* _track_name, double _value) {
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::pmc_event_with_sample, _track_name, timestamp_ns,
event_metadata, stack_id, parent_stack_id, correlation_id, call_stack,
line_info, device_id, static_cast<uint8_t>(agent_type::CPU), _track_name,
_value);
trace_cache::get_buffer_storage().store(trace_cache::pmc_event_with_sample{
_track_name, timestamp_ns, event_metadata, stack_id, parent_stack_id,
correlation_id, call_stack, line_info, device_id,
static_cast<uint8_t>(agent_type::CPU), _track_name, _value });
};
if constexpr(std::is_same_v<Category, category::thread_hardware_counter>)
@@ -27,6 +27,7 @@
#include "core/state.hpp"
#include "core/timemory.hpp"
#include "core/trace_cache/cache_manager.hpp"
#include "core/trace_cache/sample_type.hpp"
#include "library/causal/data.hpp"
#include "library/runtime.hpp"
#include "library/thread_info.hpp"
@@ -55,9 +56,9 @@ cache_region(uint64_t thread_id, const std::string& name, uint64_t start_ts,
constexpr const char* CALLSTACK = "";
constexpr const char* ARGUMENTS = "";
rocprofsys::trace_cache::get_buffer_storage().store(
rocprofsys::trace_cache::entry_type::region, thread_id, name.c_str(),
NO_CORRELATION_ID, NO_CORRELATION_ID, start_ts, end_ts, CALLSTACK, ARGUMENTS,
category.c_str());
rocprofsys::trace_cache::region_sample{
thread_id, name.c_str(), NO_CORRELATION_ID, NO_CORRELATION_ID, start_ts,
end_ts, CALLSTACK, ARGUMENTS, category.c_str() });
}
struct entry_key
@@ -151,12 +151,11 @@ cache_comm_data_events(const uint32_t device_id, int bytes)
const std::string call_stack = "{}";
const std::string line_info = "{}";
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::pmc_event_with_sample, track_name.c_str(), timestamp_ns,
event_metadata.c_str(), stack_id, parent_stack_id, correlation_id,
call_stack.c_str(), line_info.c_str(), device_id,
trace_cache::get_buffer_storage().store(trace_cache::pmc_event_with_sample{
track_name.c_str(), timestamp_ns, event_metadata.c_str(), stack_id,
parent_stack_id, correlation_id, call_stack.c_str(), line_info.c_str(), device_id,
static_cast<uint8_t>(agent_type::CPU), track_name.c_str(),
static_cast<double>(value));
static_cast<double>(value) });
}
} // namespace
@@ -260,14 +260,13 @@ sample()
auto _freqs = component::cpu_freq{}.sample();
// user and kernel mode times are in microseconds
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::cpu_freq_sample, _timestamp, tim::get_page_rss(),
tim::get_virt_mem(), _rcache.get_peak_rss(),
trace_cache::get_buffer_storage().store(trace_cache::cpu_freq_sample{
_timestamp, tim::get_page_rss(), tim::get_virt_mem(), _rcache.get_peak_rss(),
_rcache.get_num_priority_context_switch() +
_rcache.get_num_voluntary_context_switch(),
_rcache.get_num_major_page_faults() + _rcache.get_num_minor_page_faults(),
_rcache.get_user_mode_time() * 1000, _rcache.get_kernel_mode_time() * 1000,
serialize_freqs(_freqs));
serialize_freqs(_freqs) });
data.emplace_back(
_timestamp, tim::get_page_rss(), tim::get_virt_mem(), _rcache.get_peak_rss(),
@@ -191,10 +191,10 @@ cache_kokkos_event(const char* name, const char* event_type, const char* target,
const char* line_info = "{}";
rocprofsys::trace_cache::get_buffer_storage().store(
rocprofsys::trace_cache::entry_type::in_time_sample,
rocprofsys::trait::name<category::kokkos>::value, timestamp_ns,
event_metadata.dump().c_str(), stack_id, parent_stack_id, correlation_id,
call_stack, line_info);
rocprofsys::trace_cache::in_time_sample{
rocprofsys::trait::name<category::kokkos>::value, timestamp_ns,
event_metadata.dump().c_str(), stack_id, parent_stack_id, correlation_id,
call_stack, line_info });
}
} // namespace
@@ -526,7 +526,6 @@ get_mem_alloc_address(
}
#endif
// clang-format off
void
cache_region(const rocprofiler_callback_tracing_record_t* record,
const rocprofiler_timestamp_t start_timestamp,
@@ -538,92 +537,62 @@ cache_region(const rocprofiler_callback_tracing_record_t* record,
trace_cache::get_metadata_registry().get_callback_tracing_info();
auto _name = std::string{ callback_tracing_info.at(record->kind, record->operation) };
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::region,
record->thread_id,
_name.c_str(),
record->correlation_id.internal,
get_parent_stack_id(record->correlation_id),
start_timestamp,
end_timestamp,
call_stack.c_str(),
args_str.c_str(),
category.c_str());
trace_cache::get_buffer_storage().store(trace_cache::region_sample{
record->thread_id, _name.c_str(), record->correlation_id.internal,
get_parent_stack_id(record->correlation_id), start_timestamp, end_timestamp,
call_stack.c_str(), args_str.c_str(), category.c_str() });
}
void
cache_kernel_dispatch(rocprofiler_buffer_tracing_kernel_dispatch_record_t* record, uint64_t stream_handle)
cache_kernel_dispatch(rocprofiler_buffer_tracing_kernel_dispatch_record_t* record,
uint64_t stream_handle)
{
auto queue_handle = record->dispatch_info.queue_id.handle;
auto queue_handle = record->dispatch_info.queue_id.handle;
trace_cache::get_metadata_registry().add_queue(queue_handle);
trace_cache::get_metadata_registry().add_stream(stream_handle);
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::kernel_dispatch,
record->start_timestamp,
record->end_timestamp,
record->thread_id,
record->dispatch_info.agent_id.handle,
record->dispatch_info.kernel_id,
record->dispatch_info.dispatch_id,
record->dispatch_info.queue_id.handle,
record->correlation_id.internal,
get_parent_stack_id(record->correlation_id),
trace_cache::get_buffer_storage().store(trace_cache::kernel_dispatch_sample{
record->start_timestamp, record->end_timestamp, record->thread_id,
record->dispatch_info.agent_id.handle, record->dispatch_info.kernel_id,
record->dispatch_info.dispatch_id, record->dispatch_info.queue_id.handle,
record->correlation_id.internal, get_parent_stack_id(record->correlation_id),
record->dispatch_info.private_segment_size,
record->dispatch_info.group_segment_size,
record->dispatch_info.workgroup_size.x,
record->dispatch_info.workgroup_size.y,
record->dispatch_info.workgroup_size.z,
record->dispatch_info.grid_size.x,
record->dispatch_info.grid_size.y,
record->dispatch_info.grid_size.z,
stream_handle);
record->dispatch_info.group_segment_size, record->dispatch_info.workgroup_size.x,
record->dispatch_info.workgroup_size.y, record->dispatch_info.workgroup_size.z,
record->dispatch_info.grid_size.x, record->dispatch_info.grid_size.y,
record->dispatch_info.grid_size.z, stream_handle });
}
void
cache_memory_copy(rocprofiler_buffer_tracing_memory_copy_record_t* record, uint64_t stream_handle)
cache_memory_copy(rocprofiler_buffer_tracing_memory_copy_record_t* record,
uint64_t stream_handle)
{
trace_cache::get_metadata_registry().add_stream(stream_handle);
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::memory_copy,
record->start_timestamp,
record->end_timestamp,
record->thread_id,
record->dst_agent_id.handle,
record->src_agent_id.handle,
static_cast<int32_t>(record->kind),
static_cast<int32_t>(record->operation),
record->bytes,
record->correlation_id.internal,
get_parent_stack_id(record->correlation_id),
get_mem_copy_dst_address(*record),
get_mem_copy_src_address(*record),
stream_handle);
trace_cache::get_buffer_storage().store(trace_cache::memory_copy_sample{
record->start_timestamp, record->end_timestamp, record->thread_id,
record->dst_agent_id.handle, record->src_agent_id.handle,
static_cast<int32_t>(record->kind), static_cast<int32_t>(record->operation),
record->bytes, record->correlation_id.internal,
get_parent_stack_id(record->correlation_id), get_mem_copy_dst_address(*record),
get_mem_copy_src_address(*record), stream_handle });
}
#if (ROCPROFILER_VERSION >= 600)
#if(ROCPROFILER_VERSION >= 600)
void
cache_memory_allocation(rocprofiler_buffer_tracing_memory_allocation_record_t* record, uint64_t stream_handle)
cache_memory_allocation(rocprofiler_buffer_tracing_memory_allocation_record_t* record,
uint64_t stream_handle)
{
trace_cache::get_metadata_registry().add_stream(stream_handle);
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::memory_alloc,
record->start_timestamp,
record->end_timestamp,
record->thread_id,
record->agent_id.handle,
static_cast<int32_t>(record->kind),
static_cast<int32_t>(record->operation),
record->allocation_size,
record->correlation_id.internal,
get_parent_stack_id(record->correlation_id),
get_mem_alloc_address(*record),
stream_handle);
trace_cache::get_buffer_storage().store(trace_cache::memory_allocate_sample{
record->start_timestamp, record->end_timestamp, record->thread_id,
record->agent_id.handle, static_cast<int32_t>(record->kind),
static_cast<int32_t>(record->operation), record->allocation_size,
record->correlation_id.internal, get_parent_stack_id(record->correlation_id),
get_mem_alloc_address(*record), stream_handle });
}
#endif
// clang-format on
std::string
get_args_string(const function_args_t& args)
@@ -120,12 +120,11 @@ counter_event::operator()(const client_data* tool_data, ::perfetto::CounterTrack
auto agent = get_agent_manager_instance().get_agent_by_handle(agent_handle);
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::pmc_event_with_sample, track_name.c_str(),
_timing.start, event_metadata.c_str(), stack_id, parent_stack_id,
correlation_id, call_stack.c_str(), line_info.c_str(),
trace_cache::get_buffer_storage().store(trace_cache::pmc_event_with_sample{
track_name.c_str(), _timing.start, event_metadata.c_str(), stack_id,
parent_stack_id, correlation_id, call_stack.c_str(), line_info.c_str(),
static_cast<uint32_t>(agent.device_id), static_cast<uint8_t>(agent.type),
track_name.c_str(), value);
track_name.c_str(), static_cast<double>(value) });
}
}
@@ -315,13 +315,12 @@ cache_sampling_data(int64_t _tid, const std::vector<timer_sampling_data>& _timer
auto _call_stack = generate_call_stack_json(iitr);
auto _line_info = generate_line_info_json(iitr);
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::backtrace_region_sample,
trace_cache::get_buffer_storage().store(trace_cache::backtrace_region_sample{
static_cast<uint32_t>(ROCPROFSYS_CATEGORY_TIMER_SAMPLING),
static_cast<uint64_t>(_thread_info->index_data->system_value),
_track_name.c_str(), _name.c_str(), itr.m_beg, itr.m_end,
trait::name<category::timer_sampling>::value, _call_stack.c_str(),
_line_info.c_str(), "{}");
_line_info.c_str(), "{}" });
}
}
@@ -348,13 +347,12 @@ cache_sampling_data(int64_t _tid, const std::vector<timer_sampling_data>& _timer
auto _call_stack = generate_call_stack_json(iitr);
auto _line_info = generate_line_info_json(iitr);
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::backtrace_region_sample,
trace_cache::get_buffer_storage().store(trace_cache::backtrace_region_sample{
static_cast<uint32_t>(ROCPROFSYS_CATEGORY_OVERFLOW_SAMPLING),
static_cast<uint64_t>(_thread_info->index_data->system_value),
_track_name.c_str(), _name.c_str(), itr.m_beg, itr.m_end,
trait::name<category::overflow_sampling>::value, _call_stack.c_str(),
_line_info.c_str(), "{}");
_line_info.c_str(), "{}" });
}
}
}