fix: [rocm/rocm_smi_lib] rsmi_dev_activity_metric_get gfx/memory activity does not update with GPU activity
Checks and forces rereading gpu metrics unconditionally
Code changes related to the following:
* Device::dev_log_gpu_metrics()
* Examples
* Unit tests
Change-Id: Ic1c4f34a39f2bf197263f80ddbb84da26345807d
Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
[ROCm/rocm_smi_lib commit: b4d37caa70]
This commit is contained in:
@@ -1000,6 +1000,23 @@ int main() {
|
||||
for (const auto& dclk : gpu_metrics.current_dclk0s) {
|
||||
std::cout << "\t -> " << std::dec << dclk << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\n";
|
||||
std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
|
||||
constexpr uint16_t kMAX_ITER_TEST = 10;
|
||||
rsmi_gpu_metrics_t gpu_metrics_check;
|
||||
for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) {
|
||||
rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check);
|
||||
std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\n";
|
||||
for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) {
|
||||
rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check);
|
||||
std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\n\n";
|
||||
std::cout << " ** Note: Values MAX'ed out (UINTX MAX are unsupported for the version in question) ** " << "\n";
|
||||
|
||||
|
||||
|
||||
@@ -2680,47 +2680,42 @@ rsmi_status_t Device::dev_read_gpu_metrics_header_data()
|
||||
LOG_TRACE(ostrstream);
|
||||
|
||||
// Check if/when metrics table needs to be refreshed.
|
||||
if ((!m_gpu_metrics_header.m_structure_size) ||
|
||||
(!m_gpu_metrics_header.m_format_revision) ||
|
||||
(!m_gpu_metrics_header.m_content_revision)) {
|
||||
auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
|
||||
sizeof(AMDGpuMetricsHeader_v1_t),
|
||||
&m_gpu_metrics_header);
|
||||
if ((status_code = ErrnoToRsmiStatus(op_result)) !=
|
||||
rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
|
||||
<< " | Cause: readDevInfo(kDevGpuMetrics)"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " Could not read Metrics Header: "
|
||||
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) ==
|
||||
rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) {
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
|
||||
<< " | Cause: gpu metric file version is not supported: "
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " Could not read Metrics Header: "
|
||||
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs();
|
||||
auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
|
||||
sizeof(AMDGpuMetricsHeader_v1_t),
|
||||
&m_gpu_metrics_header);
|
||||
if ((status_code = ErrnoToRsmiStatus(op_result)) !=
|
||||
rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
|
||||
<< " | Cause: readDevInfo(kDevGpuMetrics)"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " Could not read Metrics Header: "
|
||||
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) ==
|
||||
rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) {
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
|
||||
<< " | Cause: gpu metric file version is not supported: "
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " Could not read Metrics Header: "
|
||||
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs();
|
||||
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
@@ -2842,23 +2837,21 @@ rsmi_status_t Device::setup_gpu_metrics_reading()
|
||||
}
|
||||
|
||||
//
|
||||
// if/in case setup_gpu_metrics_reading() was called already use the same pointer
|
||||
m_gpu_metrics_ptr.reset();
|
||||
m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version);
|
||||
if (!m_gpu_metrics_ptr) {
|
||||
m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version);
|
||||
if (!m_gpu_metrics_ptr) {
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
//
|
||||
@@ -2938,23 +2931,21 @@ rsmi_status_t Device::dev_log_gpu_metrics(std::ostringstream& outstream_metrics)
|
||||
// meaning, we didn't run any queries, and just want to
|
||||
// print all the gpu metrics content, we need to setup
|
||||
// the environment first.
|
||||
if (!m_gpu_metrics_ptr) {
|
||||
status_code = setup_gpu_metrics_reading();
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) {
|
||||
// At this point we should have a valid gpu_metrics pointer.
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: Couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
status_code = setup_gpu_metrics_reading();
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) {
|
||||
// At this point we should have a valid gpu_metrics pointer.
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: Couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
// Header info
|
||||
@@ -3100,22 +3091,20 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met
|
||||
ostrstream << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ostrstream);
|
||||
|
||||
if (!m_gpu_metrics_ptr) {
|
||||
status_code = setup_gpu_metrics_reading();
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) {
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: Couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
status_code = setup_gpu_metrics_reading();
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) {
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: Couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
// Lookup the dynamic table
|
||||
|
||||
@@ -120,14 +120,6 @@ auto print_error_or_value(std::string title, std::string func_name, const T& met
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
std::string print_unsigned_int(T value) {
|
||||
std::stringstream ss;
|
||||
ss << static_cast<uint64_t>(value | 0);
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
void TestGpuMetricsRead::Run(void) {
|
||||
rsmi_status_t err;
|
||||
|
||||
@@ -148,8 +140,8 @@ void TestGpuMetricsRead::Run(void) {
|
||||
auto ret = rsmi_dev_metrics_header_info_get(i, &header_values);
|
||||
if (ret == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::cout << "\t[Metrics Header]" << "\n";
|
||||
std::cout << "\t -> format_revision : " << print_unsigned_int(header_values.format_revision) << "\n";
|
||||
std::cout << "\t -> content_revision : " << print_unsigned_int(header_values.content_revision) << "\n";
|
||||
std::cout << "\t -> format_revision : " << amd::smi::print_unsigned_int(header_values.format_revision) << "\n";
|
||||
std::cout << "\t -> content_revision : " << amd::smi::print_unsigned_int(header_values.content_revision) << "\n";
|
||||
std::cout << "\t--------------------" << "\n";
|
||||
}
|
||||
}
|
||||
@@ -256,6 +248,25 @@ void TestGpuMetricsRead::Run(void) {
|
||||
for (int i = 0; i < RSMI_MAX_NUM_CLKS; ++i) {
|
||||
std::cout << "\tcurrent_dclk0s[" << i << "]=" << std::dec << smu.current_dclk0s[i] << '\n';
|
||||
}
|
||||
|
||||
|
||||
std::cout << "\n\n";
|
||||
std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
|
||||
constexpr uint16_t kMAX_ITER_TEST = 10;
|
||||
rsmi_gpu_metrics_t gpu_metrics_check;
|
||||
for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) {
|
||||
rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check);
|
||||
std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\n";
|
||||
for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) {
|
||||
rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check);
|
||||
std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\n";
|
||||
std::cout << " ** Note: Values MAX'ed out (UINTX MAX are unsupported for the version in question) ** " << "\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user