Enable VCN tracing in Perfetto output (#65)

Enable tracing of VCN activity on the different VCN instances, using the GPU metrics fetched with rsmi_dev_gpu_metrics_info_get from the ROCm-SMI library.

The tracing can be controlled with ROCPROFSYS_ROCM_SMI_METRICS by including vcn_activity in its value. Currently this setting accepts the following values: busy, temp, power, mem_usage, vcn_activity.
By default, all 5 values are enabled.

Signed-off-by: Sajina P Kandy <Sajina.PuthalathKandy@amd.com>
Co-authored-by: Sajina Kandy <sputhala-amd@amd.com>

[ROCm/rocprofiler-systems commit: 3fa37c991e]
This commit is contained in:
Sajina PK
2024-12-18 15:56:48 -05:00
committed by GitHub
orang tua 9be05fc673
melakukan 2d6b4d9988
11 mengubah file dengan 90 tambahan dan 17 penghapusan
@@ -58,6 +58,7 @@ GPU metrics
* Power usage
* Temperature
* Utilization
* VCN activity
CPU metrics
========================================
Binary file not shown.

After

Width:  |  Height:  |  Ukuran: 96 KiB

@@ -464,6 +464,7 @@ Viewing components
| sampling_gpu_power | GPU Power Usage via ROCm-SMI. Derived fro... |
| sampling_gpu_temp | GPU Temperature via ROCm-SMI. Derived fro... |
| sampling_gpu_busy | GPU Utilization (% busy) via ROCm-SMI. De... |
| sampling_gpu_vcn | GPU VCN Utilization (% activity) via ROCm... |
| sampling_gpu_memory_usage | GPU Memory Usage via ROCm-SMI. Derived fr... |
|-----------------------------------|----------------------------------------------|
@@ -325,15 +325,23 @@ this file.
.. image:: ../data/rocprof-sys-perfetto.png
:alt: Visualization of a performance graph in Perfetto
:width: 800
.. image:: ../data/rocprof-sys-rocm.png
:alt: Visualization of ROCm data in Perfetto
:width: 800
.. image:: ../data/rocprof-sys-rocm-flow.png
:alt: Visualization of ROCm flow data in Perfetto
:width: 800
.. image:: ../data/rocprof-sys-user-api.png
:alt: Visualization of ROCm API calls in Perfetto
:width: 800
.. image:: ../data/rocprof-sys-gpu-metrics.png
:alt: Visualization of ROCm GPU metrics in Perfetto
:width: 800
Timemory output
========================================
@@ -105,6 +105,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_busy, ROCPROFSYS_CATEGORY_ROCM_SMI
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_temp, ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, "device_temp", "Temperature of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_power, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, "device_power", "Power consumption of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_memory_usage, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_vcn_activity, ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY, "device_vcn_activity", "VCN Activity of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions")
ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions")
ROCPROFSYS_DEFINE_CATEGORY(category, kokkos, ROCPROFSYS_CATEGORY_KOKKOS, "kokkos", "KokkosTools regions")
@@ -167,6 +168,7 @@ using name = perfetto_category<Tp...>;
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_temp), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_power), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_memory_usage), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_vcn_activity), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \
ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \
ROCPROFSYS_PERFETTO_CATEGORY(category::kokkos), \
@@ -82,6 +82,8 @@ struct backtrace_gpu_power
{};
struct backtrace_gpu_memory
{};
// Empty struct with no members; its only use visible here is as the second
// template argument of data_tracker<double, backtrace_gpu_vcn>, which is
// aliased as sampling_gpu_vcn.
struct backtrace_gpu_vcn
{};
using sampling_wall_clock = data_tracker<double, backtrace_wall_clock>;
using sampling_cpu_clock = data_tracker<double, backtrace_cpu_clock>;
using sampling_percent = data_tracker<double, backtrace_fraction>;
@@ -89,6 +91,7 @@ using sampling_gpu_busy = data_tracker<double, backtrace_gpu_busy>;
using sampling_gpu_temp = data_tracker<double, backtrace_gpu_temp>;
using sampling_gpu_power = data_tracker<double, backtrace_gpu_power>;
using sampling_gpu_memory = data_tracker<double, backtrace_gpu_memory>;
using sampling_gpu_vcn = data_tracker<double, backtrace_gpu_vcn>;
template <typename ApiT, typename StartFuncT = default_functor_t,
typename StopFuncT = default_functor_t>
@@ -121,6 +124,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_busy, fal
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_temp, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_power, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_memory, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_vcn, false_type)
#endif
TIMEMORY_SET_COMPONENT_API(rocprofsys::component::roctracer, project::rocprofsys,
@@ -152,6 +156,9 @@ TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_temp, project::ro
tpls::rocm, device::gpu, os::supports_linux,
category::temperature, category::sampling,
category::process_sampling)
TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_vcn, project::rocprofsys,
tpls::rocm, device::gpu, os::supports_linux,
category::sampling, category::process_sampling)
TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::roctracer, "roctracer",
"High-precision ROCm API and kernel tracing", "")
@@ -180,6 +187,10 @@ TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_power,
TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_temp,
"sampling_gpu_temp", "GPU Temperature via ROCm-SMI",
"Derived from sampling")
TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_vcn,
"sampling_gpu_vcn",
"GPU VCN Utilization (% activity) via ROCm-SMI",
"Derived from sampling")
// statistics type
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_wall_clock, double)
@@ -188,6 +199,7 @@ TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_busy, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_temp, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_power, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_memory, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_vcn, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::comm_data_tracker_t, float)
// enable timing units
@@ -219,6 +231,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_busy, false
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_temp, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_power, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_memory, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_vcn, false_type)
// reporting categories (mean)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_mean, component::sampling_percent, false_type)
@@ -317,8 +317,8 @@ configure_settings(bool _init)
ROCPROFSYS_CONFIG_SETTING(
bool, "ROCPROFSYS_USE_ROCM_SMI",
"Enable sampling GPU power, temp, utilization, and memory usage", true, "backend",
"rocm_smi", "rocm", "process_sampling");
"Enable sampling GPU power, temp, utilization, vcn_activity and memory usage",
true, "backend", "rocm_smi", "rocm", "process_sampling");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_SAMPLING",
"Enable statistical sampling of call-stack", false,
@@ -626,10 +626,11 @@ configure_settings(bool _init)
rocprofiler_sdk::config_settings(_config);
ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_SMI_METRICS",
"rocm-smi metrics to collect: busy, temp, power, mem_usage",
"busy,temp,power,mem_usage", "backend", "rocm_smi", "rocm",
"process_sampling", "advanced");
ROCPROFSYS_CONFIG_SETTING(
std::string, "ROCPROFSYS_ROCM_SMI_METRICS",
"rocm-smi metrics to collect: busy, temp, power, vcn_activity, mem_usage",
"busy,temp,power,vcn_activity,mem_usage", "backend", "rocm_smi", "rocm",
"process_sampling", "advanced");
ROCPROFSYS_CONFIG_SETTING(size_t, "ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB",
"Hint for shared-memory buffer size in perfetto (in KB)",
@@ -57,6 +57,7 @@ extern "C"
ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP,
ROCPROFSYS_CATEGORY_ROCM_SMI_POWER,
ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE,
ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY,
ROCPROFSYS_CATEGORY_ROCM_RCCL,
ROCPROFSYS_CATEGORY_SAMPLING,
ROCPROFSYS_CATEGORY_PTHREAD,
@@ -127,6 +127,7 @@ data::sample(uint32_t _dev_id)
{
auto _ts = tim::get_clock_real_now<size_t, std::nano>();
assert(_ts < std::numeric_limits<int64_t>::max());
rsmi_gpu_metrics_t _gpu_metrics;
auto _state = get_state().load();
@@ -159,6 +160,13 @@ data::sample(uint32_t _dev_id)
&m_power, &power_type)
ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).mem_usage, rsmi_dev_memory_usage_get,
_dev_id, RSMI_MEM_TYPE_VRAM, &m_mem_usage);
ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).vcn_activity,
rsmi_dev_gpu_metrics_info_get, _dev_id, &_gpu_metrics);
for(const auto& activity : _gpu_metrics.vcn_activity)
{
if(activity != UINT16_MAX) m_vcn_metrics.push_back(activity);
}
#undef ROCPROFSYS_RSMI_GET
}
@@ -257,6 +265,7 @@ data::post_process(uint32_t _dev_id)
using component::sampling_gpu_memory;
using component::sampling_gpu_power;
using component::sampling_gpu_temp;
using component::sampling_gpu_vcn;
if(device_count < _dev_id) return;
@@ -273,7 +282,7 @@ data::post_process(uint32_t _dev_id)
auto _settings = get_settings(_dev_id);
auto _process_perfetto = [&]() {
auto _idx = std::array<uint64_t, 4>{};
auto _idx = std::array<uint64_t, 5>{};
{
_idx.fill(_idx.size());
uint64_t nidx = 0;
@@ -281,6 +290,7 @@ data::post_process(uint32_t _dev_id)
if(_settings.temp) _idx.at(1) = nidx++;
if(_settings.power) _idx.at(2) = nidx++;
if(_settings.mem_usage) _idx.at(3) = nidx++;
if(_settings.vcn_activity) _idx.at(4) = nidx++;
}
for(auto& itr : _rocm_smi)
@@ -301,6 +311,14 @@ data::post_process(uint32_t _dev_id)
if(_settings.mem_usage)
counter_track::emplace(_dev_id, addendum("Memory Usage"),
"megabytes");
if(_settings.vcn_activity)
{
for(std::size_t i = 0; i < std::size(itr.m_vcn_metrics); ++i)
counter_track::emplace(
_dev_id,
addendum(("VCN Activity on " + std::to_string(i)).c_str()),
"%");
}
}
uint64_t _ts = itr.m_ts;
if(!_thread_info->is_valid_time(_ts)) continue;
@@ -322,6 +340,16 @@ data::post_process(uint32_t _dev_id)
if(_settings.mem_usage)
TRACE_COUNTER("device_memory_usage",
counter_track::at(_dev_id, _idx.at(3)), _ts, _usage);
if(_settings.vcn_activity)
{
uint64_t idx = _idx.at(4);
for(const auto& temp : itr.m_vcn_metrics)
{
TRACE_COUNTER("device_vcn_activity", counter_track::at(_dev_id, idx),
_ts, temp);
++idx;
}
}
}
};
@@ -411,6 +439,7 @@ setup()
key_pair_t{ "temp", get_settings(dev_id).temp },
key_pair_t{ "power", get_settings(dev_id).power },
key_pair_t{ "mem_usage", get_settings(dev_id).mem_usage },
key_pair_t{ "vcn_activity", get_settings(dev_id).vcn_activity },
};
get_settings(dev_id) = { false, false, false, false };
@@ -491,3 +520,7 @@ ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(
TIMEMORY_ESC(data_tracker<double, rocprofsys::component::backtrace_gpu_memory>), true,
double)
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(
TIMEMORY_ESC(data_tracker<double, rocprofsys::component::backtrace_gpu_vcn>), true,
double)
@@ -71,10 +71,11 @@ device_count();
struct settings
{
bool busy = true;
bool temp = true;
bool power = true;
bool mem_usage = true;
bool busy = true;
bool temp = true;
bool power = true;
bool mem_usage = true;
bool vcn_activity = true;
};
struct data
@@ -99,12 +100,13 @@ struct data
static void post_process(uint32_t _dev_id);
uint32_t m_dev_id = std::numeric_limits<uint32_t>::max();
timestamp_t m_ts = 0;
busy_perc_t m_busy_perc = 0;
temp_t m_temp = 0;
power_t m_power = 0;
mem_usage_t m_mem_usage = 0;
uint32_t m_dev_id = std::numeric_limits<uint32_t>::max();
timestamp_t m_ts = 0;
busy_perc_t m_busy_perc = 0;
temp_t m_temp = 0;
power_t m_power = 0;
mem_usage_t m_mem_usage = 0;
std::vector<uint16_t> m_vcn_metrics = {};
friend std::ostream& operator<<(std::ostream& _os, const data& _v)
{
@@ -179,5 +181,9 @@ ROCPROFSYS_DECLARE_EXTERN_COMPONENT(
TIMEMORY_ESC(data_tracker<double, rocprofsys::component::backtrace_gpu_memory>), true,
double)
ROCPROFSYS_DECLARE_EXTERN_COMPONENT(
TIMEMORY_ESC(data_tracker<double, rocprofsys::component::backtrace_gpu_vcn>), true,
double)
# endif
#endif
@@ -129,6 +129,7 @@ using component::sampling_gpu_busy;
using component::sampling_gpu_memory;
using component::sampling_gpu_power;
using component::sampling_gpu_temp;
using component::sampling_gpu_vcn;
using component::sampling_percent;
using component::sampling_wall_clock;
} // namespace sampling
@@ -1572,6 +1573,12 @@ struct sampling_initialization
sampling_gpu_temp::display_unit() = "degC";
sampling_gpu_temp::set_precision(1);
sampling_gpu_temp::set_format_flags(sampling_gpu_temp::get_format_flags());
sampling_gpu_vcn::label() = "sampling_gpu_vcn_percent";
sampling_gpu_vcn::description() = "Utilization of VCN(s)";
sampling_gpu_vcn::set_precision(0);
sampling_gpu_vcn::set_format_flags(sampling_gpu_vcn::get_format_flags() &
std::ios_base::showpoint);
}
};
} // namespace