Show VCN and JPEG busy values where VCN/JPEG activity is not supported. (#232)
With AMD SMI in ROCm 7.0, vcn_activity and jpeg_activity are not reported when the per-XCP (partition) stats, vcn_busy and jpeg_busy, are available, which causes activity tracking to fail. The fix is to read the busy values when the activity values are not supported. For issue: SWDEV-536439 --------- Co-authored-by: David Galiffi <David.Galiffi@amd.com>
此提交包含在:
+34
-37
@@ -73,6 +73,17 @@ check_amdsmi_error(amdsmi_status_t _code, const char* _file, int _line)
|
||||
_msg);
|
||||
}
|
||||
|
||||
// Ensures initialization happens only once
|
||||
std::once_flag amdsmi_once;
|
||||
|
||||
// Tracks whether AMD SMI is initialized
// Returns a mutable reference to a function-local static flag that starts
// out false, so the same accessor is used for both reading and writing:
// amdsmi_init() assigns true after ::amdsmi_init() and
// get_processor_handles() complete, and assigns false from its exception
// handler; initialize_amdsmi() returns the current value.
|
||||
bool&
|
||||
_amdsmi_is_initialized()
|
||||
{
|
||||
// Static storage: the value persists across calls.
static bool initialized = false;
|
||||
return initialized;
|
||||
}
|
||||
|
||||
bool
|
||||
amdsmi_init()
|
||||
{
|
||||
@@ -82,10 +93,12 @@ amdsmi_init()
|
||||
// Currently, only AMDSMI_INIT_AMD_GPUS is supported
|
||||
ROCPROFSYS_AMD_SMI_CALL(::amdsmi_init(AMDSMI_INIT_AMD_GPUS));
|
||||
get_processor_handles();
|
||||
_amdsmi_is_initialized() = true; // Mark as initialized
|
||||
} catch(std::exception& _e)
|
||||
{
|
||||
ROCPROFSYS_BASIC_VERBOSE(1, "Exception thrown initializing amd-smi: %s\n",
|
||||
_e.what());
|
||||
_amdsmi_is_initialized() = false; // Mark as not initialized
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@@ -143,7 +156,9 @@ bool
|
||||
initialize_amdsmi()
|
||||
{
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
return (amdsmi_init()) ? true : false;
|
||||
// Ensure initialization happens only once
|
||||
std::call_once(amdsmi_once, amdsmi_init);
|
||||
return _amdsmi_is_initialized();
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
@@ -252,45 +267,27 @@ get_processor_handles()
|
||||
processors::processors_list.push_back(processor);
|
||||
|
||||
amdsmi_gpu_metrics_t gpu_metrics;
|
||||
bool vcn_supported = false;
|
||||
bool jpeg_supported = false;
|
||||
bool v_busy_supported = false;
|
||||
bool j_busy_supported = false;
|
||||
ret = amdsmi_get_gpu_metrics_info(processor, &gpu_metrics);
|
||||
if(ret == AMDSMI_STATUS_SUCCESS)
|
||||
bool vcn_supported = false, jpeg_supported = false;
|
||||
bool v_busy_supported = false, j_busy_supported = false;
|
||||
// AMD SMI will not report VCN_activity and JPEG_activity, if VCN_busy or
|
||||
// JPEG_busy fields are available.
|
||||
if(amdsmi_get_gpu_metrics_info(processor, &gpu_metrics) ==
|
||||
AMDSMI_STATUS_SUCCESS)
|
||||
{
|
||||
for(const auto& vcn_activity : gpu_metrics.vcn_activity)
|
||||
{
|
||||
if(vcn_activity != UINT16_MAX)
|
||||
{
|
||||
vcn_supported = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for(const auto& jpeg_activity : gpu_metrics.jpeg_activity)
|
||||
{
|
||||
if(jpeg_activity != UINT16_MAX)
|
||||
{
|
||||
jpeg_supported = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Helper lambda to check if any value in the array is valid
|
||||
auto has_valid = [](const auto& arr) {
|
||||
return std::any_of(std::begin(arr), std::end(arr),
|
||||
[](auto val) { return val != UINT16_MAX; });
|
||||
};
|
||||
vcn_supported = has_valid(gpu_metrics.vcn_activity);
|
||||
jpeg_supported = has_valid(gpu_metrics.jpeg_activity);
|
||||
// Check if VCN and JPEG busy metrics are available
|
||||
for(const auto& xcp : gpu_metrics.xcp_stats)
|
||||
{
|
||||
if(!v_busy_supported)
|
||||
{
|
||||
v_busy_supported =
|
||||
std::any_of(std::begin(xcp.vcn_busy), std::end(xcp.vcn_busy),
|
||||
[](uint16_t val) { return val != UINT16_MAX; });
|
||||
}
|
||||
|
||||
if(!j_busy_supported)
|
||||
{
|
||||
j_busy_supported = std::any_of(
|
||||
std::begin(xcp.jpeg_busy), std::end(xcp.jpeg_busy),
|
||||
[](uint16_t val) { return val != UINT16_MAX; });
|
||||
}
|
||||
|
||||
if(!v_busy_supported && has_valid(xcp.vcn_busy))
|
||||
v_busy_supported = true;
|
||||
if(!j_busy_supported && has_valid(xcp.jpeg_busy))
|
||||
j_busy_supported = true;
|
||||
if(v_busy_supported && j_busy_supported) break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -143,6 +143,7 @@ data::sample(uint32_t _dev_id)
|
||||
auto _ts = tim::get_clock_real_now<size_t, std::nano>();
|
||||
assert(_ts < std::numeric_limits<int64_t>::max());
|
||||
amdsmi_gpu_metrics_t _gpu_metrics;
|
||||
bool _vcn_or_jpeg_activity_enabled = false;
|
||||
|
||||
auto _state = get_state().load();
|
||||
|
||||
@@ -184,18 +185,59 @@ data::sample(uint32_t _dev_id)
|
||||
#endif
|
||||
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).mem_usage, amdsmi_get_gpu_memory_usage,
|
||||
sample_handle, AMDSMI_MEM_TYPE_VRAM, &m_mem_usage);
|
||||
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).vcn_activity,
|
||||
amdsmi_get_gpu_metrics_info, sample_handle, &_gpu_metrics);
|
||||
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).jpeg_activity,
|
||||
amdsmi_get_gpu_metrics_info, sample_handle, &_gpu_metrics);
|
||||
_vcn_or_jpeg_activity_enabled =
|
||||
get_settings(m_dev_id).vcn_activity || get_settings(m_dev_id).jpeg_activity;
|
||||
ROCPROFSYS_AMDSMI_GET(_vcn_or_jpeg_activity_enabled, amdsmi_get_gpu_metrics_info,
|
||||
sample_handle, &_gpu_metrics);
|
||||
|
||||
for(const auto& v_activity : _gpu_metrics.vcn_activity)
|
||||
// Process metrics if either VCN or JPEG activity is enabled
|
||||
if(_vcn_or_jpeg_activity_enabled)
|
||||
{
|
||||
if(v_activity != UINT16_MAX) m_vcn_metrics.push_back(v_activity);
|
||||
}
|
||||
for(const auto& j_activity : _gpu_metrics.jpeg_activity)
|
||||
{
|
||||
if(j_activity != UINT16_MAX) m_jpeg_metrics.push_back(j_activity);
|
||||
// Helper lambda to fill busy metrics from a source array
|
||||
auto fill_busy_metrics = [](auto& dest, const auto& src) {
|
||||
for(const auto& val : src)
|
||||
{
|
||||
if(val != UINT16_MAX) dest.push_back(val);
|
||||
}
|
||||
};
|
||||
|
||||
if(gpu::is_vcn_activity_supported(m_dev_id) &&
|
||||
gpu::is_jpeg_activity_supported(m_dev_id))
|
||||
{
|
||||
// Both VCN and JPEG are supported - create one entry with both metrics
|
||||
xcp_metrics_t metrics;
|
||||
fill_busy_metrics(metrics.vcn_busy, _gpu_metrics.vcn_activity);
|
||||
fill_busy_metrics(metrics.jpeg_busy, _gpu_metrics.jpeg_activity);
|
||||
if(!metrics.vcn_busy.empty() || !metrics.jpeg_busy.empty())
|
||||
m_xcp_metrics.push_back(metrics);
|
||||
}
|
||||
else if(gpu::is_vcn_activity_supported(m_dev_id))
|
||||
{
|
||||
// Only VCN is supported
|
||||
xcp_metrics_t metrics;
|
||||
fill_busy_metrics(metrics.vcn_busy, _gpu_metrics.vcn_activity);
|
||||
if(!metrics.vcn_busy.empty()) m_xcp_metrics.push_back(metrics);
|
||||
}
|
||||
else if(gpu::is_jpeg_activity_supported(m_dev_id))
|
||||
{
|
||||
// Only JPEG is supported
|
||||
xcp_metrics_t metrics;
|
||||
fill_busy_metrics(metrics.jpeg_busy, _gpu_metrics.jpeg_activity);
|
||||
if(!metrics.jpeg_busy.empty()) m_xcp_metrics.push_back(metrics);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither is supported - use XCP stats
|
||||
// Each XCP gets one entry with both its VCN and JPEG metrics
|
||||
for(const auto& xcp : _gpu_metrics.xcp_stats)
|
||||
{
|
||||
xcp_metrics_t metrics;
|
||||
fill_busy_metrics(metrics.vcn_busy, xcp.vcn_busy);
|
||||
fill_busy_metrics(metrics.jpeg_busy, xcp.jpeg_busy);
|
||||
if(!metrics.vcn_busy.empty() || !metrics.jpeg_busy.empty())
|
||||
m_xcp_metrics.push_back(metrics);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef ROCPROFSYS_AMDSMI_GET
|
||||
@@ -350,16 +392,19 @@ data::post_process(uint32_t _dev_id)
|
||||
auto addendum = [&](const char* _v) {
|
||||
return JOIN(" ", "GPU", _v, JOIN("", '[', _dev_id, ']'), "(S)");
|
||||
};
|
||||
auto addendum_blk = [&](std::size_t _i, const char* _metric) {
|
||||
if(_i < 10)
|
||||
auto addendum_blk = [&](std::size_t _i, const char* _metric,
|
||||
std::size_t xcp_idx = SIZE_MAX) {
|
||||
if(xcp_idx != SIZE_MAX)
|
||||
{
|
||||
return JOIN(" ", "GPU", JOIN("", '[', _dev_id, ']'), _metric,
|
||||
JOIN("", "[0", _i, ']'), "(S)");
|
||||
JOIN("", "XCP_", xcp_idx, ": [", (_i < 10 ? "0" : ""),
|
||||
_i, ']'),
|
||||
"(S)");
|
||||
}
|
||||
else
|
||||
{
|
||||
return JOIN(" ", "GPU", JOIN("", '[', _dev_id, ']'), _metric,
|
||||
JOIN("", '[', _i, ']'), "(S)");
|
||||
JOIN("", "[", (_i < 10 ? "0" : ""), _i, ']'), "(S)");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -378,15 +423,61 @@ data::post_process(uint32_t _dev_id)
|
||||
"megabytes");
|
||||
if(_settings.vcn_activity)
|
||||
{
|
||||
for(std::size_t i = 0; i < std::size(itr.m_vcn_metrics); ++i)
|
||||
counter_track::emplace(_dev_id, addendum_blk(i, " VCN Activity"),
|
||||
"%");
|
||||
if(itr.m_xcp_metrics.empty())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(
|
||||
1, "No VCN activity data collected from device %u\n",
|
||||
_dev_id);
|
||||
}
|
||||
else if(gpu::is_vcn_activity_supported(_dev_id))
|
||||
{
|
||||
// For VCN activity, use simple indexing
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_xcp_metrics[0].vcn_busy); ++i)
|
||||
counter_track::emplace(_dev_id,
|
||||
addendum_blk(i, "VCN Activity"), "%");
|
||||
}
|
||||
else
|
||||
{
|
||||
for(std::size_t xcp = 0; xcp < std::size(itr.m_xcp_metrics);
|
||||
++xcp)
|
||||
{
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_xcp_metrics[xcp].vcn_busy); ++i)
|
||||
{
|
||||
counter_track::emplace(
|
||||
_dev_id, addendum_blk(i, "VCN Activity", xcp), "%");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if(_settings.jpeg_activity)
|
||||
{
|
||||
for(std::size_t i = 0; i < std::size(itr.m_jpeg_metrics); ++i)
|
||||
counter_track::emplace(_dev_id, addendum_blk(i, "JPEG Activity"),
|
||||
"%");
|
||||
if(itr.m_xcp_metrics.empty())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(
|
||||
1, "No JPEG activity data collected from device %u\n",
|
||||
_dev_id);
|
||||
}
|
||||
else if(gpu::is_jpeg_activity_supported(_dev_id))
|
||||
{
|
||||
// For JPEG activity, use simple indexing
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_xcp_metrics[0].jpeg_busy); ++i)
|
||||
counter_track::emplace(_dev_id,
|
||||
addendum_blk(i, "JPEG Activity"), "%");
|
||||
}
|
||||
else
|
||||
{
|
||||
for(std::size_t xcp = 0; xcp < std::size(itr.m_xcp_metrics);
|
||||
++xcp)
|
||||
{
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_xcp_metrics[xcp].jpeg_busy); ++i)
|
||||
counter_track::emplace(
|
||||
_dev_id, addendum_blk(i, "JPEG Activity", xcp), "%");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
uint64_t _ts = itr.m_ts;
|
||||
@@ -417,25 +508,41 @@ data::post_process(uint32_t _dev_id)
|
||||
if(_settings.mem_usage)
|
||||
TRACE_COUNTER("device_memory_usage",
|
||||
counter_track::at(_dev_id, _idx.at(5)), _ts, _usage);
|
||||
if(_settings.vcn_activity)
|
||||
if(_settings.vcn_activity && !itr.m_xcp_metrics.empty())
|
||||
{
|
||||
uint64_t idx = _idx.at(6);
|
||||
for(const auto& temp : itr.m_vcn_metrics)
|
||||
// Iterate over all XCPs and their VCN busy/activity values
|
||||
for(const auto& metrics : itr.m_xcp_metrics)
|
||||
{
|
||||
TRACE_COUNTER("device_vcn_activity", counter_track::at(_dev_id, idx),
|
||||
_ts, temp);
|
||||
++idx;
|
||||
for(const auto& vcn_val : metrics.vcn_busy)
|
||||
{
|
||||
TRACE_COUNTER("device_vcn_activity",
|
||||
counter_track::at(_dev_id, idx), _ts, vcn_val);
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(_settings.jpeg_activity)
|
||||
|
||||
if(_settings.jpeg_activity && !itr.m_xcp_metrics.empty())
|
||||
{
|
||||
uint64_t idx = _idx.at(7);
|
||||
if(_settings.vcn_activity) idx += (itr.m_vcn_metrics.size() - 1);
|
||||
for(const auto& temp : itr.m_jpeg_metrics)
|
||||
// Calculate total VCN metrics to properly offset JPEG metrics index
|
||||
if(_settings.vcn_activity)
|
||||
{
|
||||
TRACE_COUNTER("device_jpeg_activity", counter_track::at(_dev_id, idx),
|
||||
_ts, temp);
|
||||
++idx;
|
||||
size_t total_vcn_metrics = 0;
|
||||
for(const auto& metrics : itr.m_xcp_metrics)
|
||||
total_vcn_metrics += metrics.vcn_busy.size();
|
||||
if(total_vcn_metrics > 0) idx += (total_vcn_metrics - 1);
|
||||
}
|
||||
// Iterate over all XCPs and their JPEG busy/activity values
|
||||
for(const auto& metrics : itr.m_xcp_metrics)
|
||||
{
|
||||
for(const auto& jpeg_val : metrics.jpeg_busy)
|
||||
{
|
||||
TRACE_COUNTER("device_jpeg_activity",
|
||||
counter_track::at(_dev_id, idx), _ts, jpeg_val);
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,6 +93,12 @@ struct data
|
||||
using mem_usage_t = uint64_t;
|
||||
using temp_t = int64_t;
|
||||
|
||||
// One entry of VCN/JPEG readings stored in data::m_xcp_metrics.
// As filled by data::sample(): values equal to UINT16_MAX are skipped, and an
// entry is only stored when at least one of the two vectors is non-empty.
// When VCN/JPEG activity is supported for the device, a single entry carries
// the device-wide vcn_activity / jpeg_activity values (despite the *_busy
// field names); otherwise there is one entry per XCP carrying that
// partition's vcn_busy / jpeg_busy values.
struct xcp_metrics_t
|
||||
{
|
||||
// VCN readings (percent, per the "%" counter unit used in post_process).
std::vector<uint16_t> vcn_busy;
|
||||
// JPEG readings (percent, per the "%" counter unit used in post_process).
std::vector<uint16_t> jpeg_busy;
|
||||
};
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(data)
|
||||
|
||||
explicit data(uint32_t _dev_id);
|
||||
@@ -102,12 +108,11 @@ struct data
|
||||
|
||||
static void post_process(uint32_t _dev_id);
|
||||
|
||||
uint32_t m_dev_id = std::numeric_limits<uint32_t>::max();
|
||||
timestamp_t m_ts = 0;
|
||||
temp_t m_temp = 0;
|
||||
mem_usage_t m_mem_usage = 0;
|
||||
std::vector<uint16_t> m_vcn_metrics = {};
|
||||
std::vector<uint16_t> m_jpeg_metrics = {};
|
||||
uint32_t m_dev_id = std::numeric_limits<uint32_t>::max();
|
||||
timestamp_t m_ts = 0;
|
||||
temp_t m_temp = 0;
|
||||
mem_usage_t m_mem_usage = 0;
|
||||
std::vector<xcp_metrics_t> m_xcp_metrics = {};
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
amdsmi_engine_usage_t m_busy_perc = {};
|
||||
amdsmi_power_info_t m_power = {};
|
||||
|
||||
新增問題並參考
封鎖使用者