From b86b8e165aecdb34aba9b50c87c0cc9931d65de4 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Wed, 14 Feb 2024 19:20:04 -0600 Subject: [PATCH] fix: [rocm/rocm_smi_lib] rsmi_dev_activity_metric_get gfx/memory activity does not update with GPU activity Checks and forces rereading gpu metrics unconditionally Code changes related to the following: * Device::dev_log_gpu_metrics() * Examples * Unit tests Change-Id: Ic1c4f34a39f2bf197263f80ddbb84da26345807d Signed-off-by: Oliveira, Daniel [ROCm/rocm_smi_lib commit: b4d37caa70aadf17e7cf0de48ef053b94f104388] --- .../rocm_smi/example/rocm_smi_example.cc | 17 ++ .../rocm-smi-lib/src/rocm_smi_gpu_metrics.cc | 167 ++++++++---------- .../functional/gpu_metrics_read.cc | 31 ++-- 3 files changed, 116 insertions(+), 99 deletions(-) diff --git a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc index 5ea30bf5bf..84608e7259 100755 --- a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc +++ b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc @@ -1000,6 +1000,23 @@ int main() { for (const auto& dclk : gpu_metrics.current_dclk0s) { std::cout << "\t -> " << std::dec << dclk << "\n"; } + + std::cout << "\n"; + std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; + constexpr uint16_t kMAX_ITER_TEST = 10; + rsmi_gpu_metrics_t gpu_metrics_check; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check); + std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n"; + } + + std::cout << "\n"; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check); + std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n"; + } + + std::cout << "\n\n"; std::cout << " ** Note: Values MAX'ed out (UINTX MAX are unsupported for the version in question) ** " << "\n"; diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc index 2f1a40837a..f3697838f7 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc @@ -2680,47 +2680,42 @@ rsmi_status_t Device::dev_read_gpu_metrics_header_data() LOG_TRACE(ostrstream); // Check if/when metrics table needs to be refreshed. - if ((!m_gpu_metrics_header.m_structure_size) || - (!m_gpu_metrics_header.m_format_revision) || - (!m_gpu_metrics_header.m_content_revision)) { - auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, - sizeof(AMDGpuMetricsHeader_v1_t), - &m_gpu_metrics_header); - if ((status_code = ErrnoToRsmiStatus(op_result)) != - rsmi_status_t::RSMI_STATUS_SUCCESS) { - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Cause: readDevInfo(kDevGpuMetrics)" - << " | Returning = " - << getRSMIStatusString(status_code) - << " Could not read Metrics Header: " - << print_unsigned_int(m_gpu_metrics_header.m_structure_size) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } - if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) == - rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Cause: gpu metric file version is not supported: " - << " | Returning = " - << getRSMIStatusString(status_code) - << " Could not read Metrics Header: " - << print_unsigned_int(m_gpu_metrics_header.m_structure_size) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } - - m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs(); + auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, + sizeof(AMDGpuMetricsHeader_v1_t), + &m_gpu_metrics_header); + if ((status_code = ErrnoToRsmiStatus(op_result)) != + rsmi_status_t::RSMI_STATUS_SUCCESS) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Cause: readDevInfo(kDevGpuMetrics)" + << " | Returning = " + << getRSMIStatusString(status_code) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; + LOG_ERROR(ostrstream); + return status_code; } + if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) == + rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Cause: gpu metric file version is not supported: " + << " | Returning = " + << getRSMIStatusString(status_code) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs(); ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " @@ -2842,23 +2837,21 @@ rsmi_status_t Device::setup_gpu_metrics_reading() } // - // if/in case setup_gpu_metrics_reading() was called already use the same pointer + m_gpu_metrics_ptr.reset(); + m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version); if (!m_gpu_metrics_ptr) { - m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version); - if (!m_gpu_metrics_ptr) { - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; } // @@ -2938,23 +2931,21 @@ rsmi_status_t Device::dev_log_gpu_metrics(std::ostringstream& outstream_metrics) // meaning, we didn't run any queries, and just want to // print all the gpu metrics content, we need to setup // the environment first. - if (!m_gpu_metrics_ptr) { - status_code = setup_gpu_metrics_reading(); - if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { - // At this point we should have a valid gpu_metrics pointer. - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } + status_code = setup_gpu_metrics_reading(); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { + // At this point we should have a valid gpu_metrics pointer. + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; } // Header info @@ -3100,22 +3091,20 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ostrstream); - if (!m_gpu_metrics_ptr) { - status_code = setup_gpu_metrics_reading(); - if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } + status_code = setup_gpu_metrics_reading(); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; } // Lookup the dynamic table diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc index 2d7bde7c67..e7c5306ad0 100644 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc @@ -120,14 +120,6 @@ auto print_error_or_value(std::string title, std::string func_name, const T& met } }; -template -std::string print_unsigned_int(T value) { - std::stringstream ss; - ss << static_cast(value | 0); - - return ss.str(); -} - void TestGpuMetricsRead::Run(void) { rsmi_status_t err; @@ -148,8 +140,8 @@ void TestGpuMetricsRead::Run(void) { auto ret = rsmi_dev_metrics_header_info_get(i, &header_values); if (ret == rsmi_status_t::RSMI_STATUS_SUCCESS) { std::cout << "\t[Metrics Header]" << "\n"; - std::cout << "\t -> format_revision : " << print_unsigned_int(header_values.format_revision) << "\n"; - std::cout << "\t -> content_revision : " << print_unsigned_int(header_values.content_revision) << "\n"; + std::cout << "\t -> format_revision : " << amd::smi::print_unsigned_int(header_values.format_revision) << "\n"; + std::cout << "\t -> content_revision : " << amd::smi::print_unsigned_int(header_values.content_revision) << "\n"; std::cout << "\t--------------------" << "\n"; } } @@ -256,6 +248,25 @@ void TestGpuMetricsRead::Run(void) { for (int i = 0; i < RSMI_MAX_NUM_CLKS; ++i) { std::cout << "\tcurrent_dclk0s[" << i << "]=" << std::dec << smu.current_dclk0s[i] << '\n'; } + + + std::cout << "\n\n"; + std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; + constexpr uint16_t kMAX_ITER_TEST = 10; + rsmi_gpu_metrics_t gpu_metrics_check; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check); + std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n"; + } + + std::cout << "\n"; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check); + std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n"; + } + + std::cout << "\n"; + std::cout << " ** Note: Values MAX'ed out (UINTX MAX are unsupported for the version in question) ** " << "\n\n"; } }