[rocprof-sys] Fix RCCL comm_data counters in rocpd output (#2607)

## Motivation

<!-- Explain the purpose of this PR and the goals it aims to achieve. -->
The validate-rccl-* tests were failing because "RCCL Comm" counters were not being written to perfetto traces when using the new cached-perfetto approach.

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->
Root Cause: The write_perfetto_counter_track() function in rccl.cpp was only called when config::get_use_perfetto() returned true, which requires ROCPROFSYS_TRACE_LEGACY=ON. As a result, RCCL counters were not captured with the new trace cache approach.

Solution: Integrated RCCL with the trace cache system:

Changes to source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.cpp:

- Added cache_rccl_comm_data_events<Track>() function to store RCCL comm data via pmc_event_with_sample with category::comm_data
- Modified tool_tracing_callback_rccl() to always cache events for the new cached-perfetto approach, while preserving the legacy write_perfetto_counter_track() calls for backward compatibility

Changes to tests/rocprof-sys-testing.cmake:

- Added rccl_api to ROCPROFSYS_ROCM_DOMAINS to enable RCCL API callback tracing

Handler verification: The perfetto_processor_t already has a handler for ROCPROFSYS_CATEGORY_COMM_DATA in m_pmc_track_map that processes the cached events.
This commit is contained in:
habajpai-amd
2026-01-23 02:08:19 +05:30
committed by GitHub
parent 9caa248bfd
commit 7e74d163fd
@@ -22,13 +22,19 @@
#include "library/rocprofiler-sdk/rccl.hpp"
#include "core/categories.hpp"
#include "core/components/fwd.hpp"
#include "core/config.hpp"
#include "core/perfetto.hpp"
#include "core/trace_cache/cache_manager.hpp"
#include "core/trace_cache/sample_type.hpp"
#include "library/tracing.hpp"
#include "logger/debug.hpp"
#include <mutex>
namespace rocprofsys
{
namespace rocprofiler_sdk
@@ -70,6 +76,33 @@ write_perfetto_counter_track(uint64_t _val, uint64_t _begin_ts, uint64_t _end_ts
}
}
/// Store an RCCL communication-data counter sample in the trace cache.
///
/// Adds @p bytes to a running total and stores the updated total as a
/// `trace_cache::pmc_event_with_sample` tagged with `category::comm_data`.
/// The running total and its mutex are function-local statics, so each
/// `Track` instantiation of this template keeps its own counter.
///
/// @tparam Track        Type exposing a static `label` that is used as the track name.
/// @param bytes         Number of bytes to add to the running total.
/// @param timestamp_ns  Timestamp passed through to the stored sample.
template <typename Track>
void
cache_rccl_comm_data_events(size_t bytes, uint64_t timestamp_ns)
{
    static std::mutex _mutex{};
    static uint64_t   cumulative_bytes = 0;

    // Take a snapshot of the updated total while the lock is held. The shared
    // counter must not be read again after the lock is released: a concurrent
    // caller may be modifying it, which would be a data race and could report
    // a value that already includes that other caller's bytes.
    uint64_t total_bytes = 0;
    {
        std::unique_lock<std::mutex> _lk{ _mutex };
        cumulative_bytes += bytes;
        total_bytes = cumulative_bytes;
    }

    const std::string track_name = Track::label;

    // No stack, correlation, call-stack or line information is supplied for
    // these samples; fixed placeholder values are passed instead.
    const std::string event_metadata  = "{}";
    const size_t      stack_id        = 0;
    const size_t      parent_stack_id = 0;
    const size_t      correlation_id  = 0;
    const std::string call_stack      = "{}";
    const std::string line_info       = "{}";
    const uint32_t    device_id       = 0;

    trace_cache::get_buffer_storage().store(trace_cache::pmc_event_with_sample{
        static_cast<size_t>(category_enum_id<category::comm_data>::value),
        track_name.c_str(), timestamp_ns, event_metadata.c_str(), stack_id,
        parent_stack_id, correlation_id, call_stack.c_str(), line_info.c_str(), device_id,
        static_cast<uint8_t>(agent_type::CPU), track_name.c_str(),
        static_cast<double>(total_bytes) });
}
static auto
rccl_type_size(ncclDataType_t datatype)
{
@@ -172,12 +205,18 @@ tool_tracing_callback_rccl(rocprofiler_callback_tracing_record_t record,
break;
}
if(config::get_use_perfetto() && size > 0)
if(size > 0)
{
if(is_send)
{
cache_rccl_comm_data_events<rccl_send>(size, end_ts);
write_perfetto_counter_track<rccl_send>(size, begin_ts, end_ts);
}
else
{
cache_rccl_comm_data_events<rccl_recv>(size, end_ts);
write_perfetto_counter_track<rccl_recv>(size, begin_ts, end_ts);
}
}
}
}