From 1bf68ad1c93e7a3055c49bc155dfca28f25cf32e Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 19 Sep 2023 13:44:20 -0500 Subject: [PATCH] rocm_smi_lib: Fix [linux BM] [AMDSMI] Memory Bandwidth Implements APIs for 'gpu_metrics_v1_3' utilization averages Code changes related to the following: * rsmi_dev_activity_metric_get() * rsmi_dev_activity_avg_mm_get() * CLI shows "Avg.Memory Bandwidth" under "--showmemuse" Change-Id: I8e4600f350a7c18499abf022534db2b875f09d5f Signed-off-by: Oliveira, Daniel [ROCm/rocm_smi_lib commit: e0483f2ee292b2d8b3b15f3ee5cbf24656976a19] --- .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 77 +++++++++++- .../rocm-smi-lib/python_smi_tools/rocm_smi.py | 7 ++ projects/rocm-smi-lib/src/rocm_smi.cc | 115 +++++++++++++++++- .../rocm-smi-lib/src/rocm_smi_gpu_metrics.cc | 6 +- .../functional/gpu_metrics_read.cc | 4 +- 5 files changed, 202 insertions(+), 7 deletions(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 92ac970841..14e4db0d58 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -466,6 +466,19 @@ typedef enum { RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type } rsmi_temperature_type_t; +/** + * @brief Activity (Utilization) Metrics. This enum is used to identify + * various activity metrics. + * + */ +typedef enum { + /* Utilization */ + RSMI_ACTIVITY_GFX = (0x1 << 0), + RSMI_ACTIVITY_UMC = (0x1 << 1), //!< memory controller + RSMI_ACTIVITY_MM = (0x1 << 2) //!< UVD or VCN +} rsmi_activity_metric_t; + + /** * @brief Voltage Metrics. This enum is used to identify various * Volatge metrics. Corresponding values will be in millivolt. @@ -774,6 +787,17 @@ typedef struct { typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth; /// \endcond +/** + * @brief This structure holds information about the possible activity + * averages. Specifically, the utilization counters. 
+ */ +typedef struct { + /* Utilization */ + uint16_t average_gfx_activity; + uint16_t average_umc_activity; //!< memory controller + uint16_t average_mm_activity; //!< UVD or VCN +} rsmi_activity_metric_counter_t; + /** * @brief This structure holds version information. */ @@ -964,7 +988,7 @@ typedef struct { uint16_t padding; // new in v1 uint32_t gfx_activity_acc; // new in v1 - uint32_t mem_actvity_acc; // new in v1 + uint32_t mem_activity_acc; // new in v1 uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1 /// \endcond } rsmi_gpu_metrics_t; @@ -2259,6 +2283,57 @@ rsmi_utilization_count_get(uint32_t dv_ind, uint32_t count, uint64_t *timestamp); +/** + * @brief Get the average activity (utilization) counters of the specified device + * + * @details Given a device index @p dv_ind and an activity metric type, + * this function returns the requested average utilization counters. + * + * @param[in] dv_ind a device index + * + * @param[in] activity_metric_type the metric type(s) to read; see ::rsmi_activity_metric_t + * + * @param[inout] activity_metric_counter Multiple utilization counters can be retrieved with a single + * call. The caller must provide a pointer to an allocated rsmi_activity_metric_counter_t structure. + * + * If the function returns RSMI_STATUS_SUCCESS, the value of each requested metric + * type is stored in the corresponding field of the + * rsmi_activity_metric_counter_t structure. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t +rsmi_dev_activity_metric_get(uint32_t dv_ind, + rsmi_activity_metric_t activity_metric_type, + rsmi_activity_metric_counter_t* activity_metric_counter); + +/** + * @brief Get the average MM (UVD or VCN) activity counter of the specified device + * + * @details Given a device index @p dv_ind, this function returns the + * average utilization counter for the ::RSMI_ACTIVITY_MM metric. + * + * @param[in] dv_ind a device index + * + * @param[inout] avg_activity pointer to caller-provided storage for the average MM activity counter + * + * If the function returns RSMI_STATUS_SUCCESS, the counter value is + * returned through @p avg_activity. Passing a null pointer yields + * ::RSMI_STATUS_INVALID_ARGS. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t +rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity); + /** * @brief Get the performance level of the device with provided * device index. 
diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index 4d1a5bad41..2a0a4655d7 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -2160,6 +2160,7 @@ def showMemUse(deviceList): @param deviceList: List of DRM devices (can be a single-item list) """ memoryUse = c_uint64() + avgMemBandwidth = c_uint16() printLogSpacer(' Current Memory Use ') for device in deviceList: ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse)) @@ -2171,6 +2172,12 @@ def showMemUse(deviceList): printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val) else: printLog(device, 'Memory Activity', 'N/A') + + ret = rocmsmi.rsmi_dev_activity_avg_mm_get(device, byref(avgMemBandwidth)) + if rsmi_ret_ok(ret, device, silent=True): + printLog(device, 'Avg. Memory Bandwidth', avgMemBandwidth.value) + else: + printLog(device, 'Not supported on the given system', None) printLogSpacer() diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index ed96a7ee65..bfe427fcba 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -3295,7 +3295,7 @@ rsmi_utilization_count_get(uint32_t dv_ind, val_ui32 = gpu_metrics.gfx_activity_acc; break; case RSMI_COARSE_GRAIN_MEM_ACTIVITY: - val_ui32 = gpu_metrics.mem_actvity_acc; + val_ui32 = gpu_metrics.mem_activity_acc; break; default: return RSMI_STATUS_INVALID_ARGS; @@ -3312,6 +3312,119 @@ rsmi_utilization_count_get(uint32_t dv_ind, CATCH } +rsmi_status_t +rsmi_dev_activity_metric_get(uint32_t dv_ind, + rsmi_activity_metric_t activity_metric_type, + rsmi_activity_metric_counter_t* activity_metric_counter) { + + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + if (!activity_metric_counter) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= 
end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Metric Type: " << activity_metric_type + << " | Cause: rsmi_activity_metric_counter_t was a null ptr reference" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ostrstream); + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + rsmi_gpu_metrics_t gpu_metrics; + status_code = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Metric Type: " << activity_metric_type + << " | Cause: rsmi_dev_gpu_metrics_info_get returned " + << getRSMIStatusString(status_code) + << " | Returning = " + << getRSMIStatusString(status_code) << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_GFX) { // flags are tested independently, so several counters can be filled in one call + activity_metric_counter->average_gfx_activity = gpu_metrics.average_gfx_activity; + ostrstream << __PRETTY_FUNCTION__ + << " | For GFX: " << activity_metric_counter->average_gfx_activity; + LOG_INFO(ostrstream); + } + if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_UMC) { + activity_metric_counter->average_umc_activity = gpu_metrics.average_umc_activity; + ostrstream << __PRETTY_FUNCTION__ + << " | For UMC: " << activity_metric_counter->average_umc_activity; + LOG_INFO(ostrstream); + } + if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_MM) { + activity_metric_counter->average_mm_activity = gpu_metrics.average_mm_activity; + ostrstream << __PRETTY_FUNCTION__ + << " | For MM: " << activity_metric_counter->average_mm_activity; + LOG_INFO(ostrstream); + } + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Metric Type: " << activity_metric_type + << " | Returning = " + << 
getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity) { + + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + if (!avg_activity) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM + << " | Cause: avg_activity was a null ptr reference" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ostrstream); + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + // Zero-initialized so a failed query never hands indeterminate data to the caller. + rsmi_activity_metric_counter_t activity_metric_counter{}; + const auto status_code = rsmi_dev_activity_metric_get(dv_ind, rsmi_activity_metric_t::RSMI_ACTIVITY_MM, &activity_metric_counter); + // Store through the out-pointer; re-seating the parameter itself would never reach the caller. + *avg_activity = activity_metric_counter.average_mm_activity; + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS ? " | Success " : " | Fail ") + << " | Device #: " << dv_ind + << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM + << " | Returning = " + << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + + rsmi_status_t rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len) { TRY diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc index c2ad2e2659..d7aab133c3 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc @@ -288,8 +288,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, rsmi_gpu_metrics->gfx_activity_acc, "rsmi_gpu_metrics->gfx_activity_acc") << print_unsigned_hex_and_int( - rsmi_gpu_metrics->mem_actvity_acc, - 
"rsmi_gpu_metrics->mem_actvity_acc"); + rsmi_gpu_metrics->mem_activity_acc, + "rsmi_gpu_metrics->mem_activity_acc"); for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) { ss << print_unsigned_hex_and_int( rsmi_gpu_metrics->temperature_hbm[i], @@ -414,7 +414,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, // These fields didn't exist in v0 data->gfx_activity_acc = 0; - data->mem_actvity_acc = 0; + data->mem_activity_acc = 0; (void)memset(data->temperature_hbm, 0, RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t)); } // else handle other conversions to format 1 diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc index a1b362fc31..f7944ddcbf 100644 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc @@ -177,8 +177,8 @@ void TestGpuMetricsRead::Run(void) { << std::to_string(smu.pcie_link_speed) << '\n'; std::cout << "gfx_activity_acc=" << std::dec << smu.gfx_activity_acc << '\n'; - std::cout << "mem_actvity_acc=" - << std::dec << smu.mem_actvity_acc << '\n'; + std::cout << "mem_activity_acc=" + << std::dec << smu.mem_activity_acc << '\n'; for (int i = 0; i < RSMI_NUM_HBM_INSTANCES; ++i) { std::cout << "temperature_hbm[" << i << "]=" << std::dec <<