rocm_smi_lib: Fix [linux BM] [AMDSMI] Memory Bandwidth

Implements APIs for 'gpu_metrics_v1_3' utilization averages

Code changes related to the following:
  * rsmi_dev_activity_metric_get()
  * rsmi_dev_activity_avg_mm_get()
  * CLI shows "Avg.Memory Bandwidth" under "--showmemuse"

Change-Id: I8e4600f350a7c18499abf022534db2b875f09d5f
Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
This commit is contained in:
Oliveira, Daniel
2023-09-19 13:44:20 -05:00
zatwierdzone przez Daniel Oliveira
rodzic b99867eb80
commit e0483f2ee2
5 zmienionych plików z 202 dodań i 7 usunięć
+76 -1
Wyświetl plik
@@ -466,6 +466,19 @@ typedef enum {
RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
} rsmi_temperature_type_t;
/**
 * @brief Selector flags for the activity (utilization) metrics.
 *
 * Each enumerator occupies its own bit, so a selector value can be tested
 * against every flag with a bitwise AND.
 */
typedef enum {
  RSMI_ACTIVITY_GFX = 0x1,  //!< GFX activity (bit 0)
  RSMI_ACTIVITY_UMC = 0x2,  //!< memory controller (bit 1)
  RSMI_ACTIVITY_MM  = 0x4   //!< UVD or VCN (bit 2)
} rsmi_activity_metric_t;
/**
* @brief Voltage Metrics. This enum is used to identify various
* Voltage metrics. Corresponding values will be in millivolt.
@@ -774,6 +787,17 @@ typedef struct {
typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth;
/// \endcond
/**
* @brief Holds the average activity (utilization) values of a device.
*
* There is one field per ::rsmi_activity_metric_t flag. The values are
* copied unmodified from the equally named fields of ::rsmi_gpu_metrics_t.
* NOTE(review): the unit/scale of the values is whatever the GPU metrics
* table reports - confirm before presenting them as a percentage.
*/
typedef struct {
/* Utilization */
uint16_t average_gfx_activity; //!< GFX average activity
uint16_t average_umc_activity; //!< memory controller
uint16_t average_mm_activity; //!< UVD or VCN
} rsmi_activity_metric_counter_t;
/**
* @brief This structure holds version information.
*/
@@ -964,7 +988,7 @@ typedef struct {
uint16_t padding; // new in v1
uint32_t gfx_activity_acc; // new in v1
uint32_t mem_actvity_acc; // new in v1
uint32_t mem_activity_acc; // new in v1
uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1
/// \endcond
} rsmi_gpu_metrics_t;
@@ -2259,6 +2283,57 @@ rsmi_utilization_count_get(uint32_t dv_ind,
uint32_t count,
uint64_t *timestamp);
/**
* @brief Get selected average activity (utilization) values of the specified
* device
*
* @details Given a device index @p dv_ind and a selector
* @p activity_metric_type, this function stores the requested average
* activity value(s) in @p activity_metric_counter.
*
* @param[in] dv_ind a device index
*
* @param[in] activity_metric_type selects which value(s) to retrieve. Each
* ::rsmi_activity_metric_t enumerator is a separate bit that is tested
* independently, so multiple values can be retrieved with a single call.
*
* @param[inout] activity_metric_counter a caller-allocated
* ::rsmi_activity_metric_counter_t; must not be NULL. If the function
* returns ::RSMI_STATUS_SUCCESS, the field corresponding to each requested
* metric has been set. Fields of metrics that were not requested are left
* unchanged.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
rsmi_status_t
rsmi_dev_activity_metric_get(uint32_t dv_ind,
rsmi_activity_metric_t activity_metric_type,
rsmi_activity_metric_counter_t* activity_metric_counter);
/**
* @brief Get the average MM (UVD or VCN) activity of the specified device
*
* @details Given a device index @p dv_ind, this function requests the
* ::RSMI_ACTIVITY_MM metric through rsmi_dev_activity_metric_get() and
* reports the resulting average_mm_activity value via @p avg_activity.
* NOTE(review): the CLI labels this value "Avg. Memory Bandwidth"; confirm
* that MM (UVD or VCN) rather than UMC (memory controller) is the intended
* source.
*
* @param[in] dv_ind a device index
*
* @param[inout] avg_activity a pointer to a caller-provided uint16_t that
* is meant to receive the average MM activity when the function returns
* ::RSMI_STATUS_SUCCESS; must not be NULL.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
rsmi_status_t
rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity);
/**
* @brief Get the performance level of the device with provided
* device index.
+7
Wyświetl plik
@@ -2160,6 +2160,7 @@ def showMemUse(deviceList):
@param deviceList: List of DRM devices (can be a single-item list)
"""
memoryUse = c_uint64()
avgMemBandwidth = c_uint16()
printLogSpacer(' Current Memory Use ')
for device in deviceList:
ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse))
@@ -2171,6 +2172,12 @@ def showMemUse(deviceList):
printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val)
else:
printLog(device, 'Memory Activity', 'N/A')
ret = rocmsmi.rsmi_dev_activity_avg_mm_get(device, byref(avgMemBandwidth))
if rsmi_ret_ok(ret, device, silent=True):
printLog(device, 'Avg. Memory Bandwidth', avgMemBandwidth.value)
else:
printLog(device, 'Not supported on the given system', None)
printLogSpacer()
+114 -1
Wyświetl plik
@@ -3295,7 +3295,7 @@ rsmi_utilization_count_get(uint32_t dv_ind,
val_ui32 = gpu_metrics.gfx_activity_acc;
break;
case RSMI_COARSE_GRAIN_MEM_ACTIVITY:
val_ui32 = gpu_metrics.mem_actvity_acc;
val_ui32 = gpu_metrics.mem_activity_acc;
break;
default:
return RSMI_STATUS_INVALID_ARGS;
@@ -3312,6 +3312,119 @@ rsmi_utilization_count_get(uint32_t dv_ind,
CATCH
}
/**
 * Copies the requested average activity value(s) of device @p dv_ind from
 * its GPU metrics into @p activity_metric_counter.
 *
 * @param dv_ind                  device index
 * @param activity_metric_type    bit mask of rsmi_activity_metric_t flags;
 *                                every flag is tested independently
 * @param activity_metric_counter destination structure; only the fields of
 *                                the requested metrics are written
 *
 * @return RSMI_STATUS_INVALID_ARGS if @p activity_metric_counter is null or
 *         @p activity_metric_type selects none of the known metrics;
 *         otherwise the status of rsmi_dev_gpu_metrics_info_get().
 */
rsmi_status_t
rsmi_dev_activity_metric_get(uint32_t dv_ind,
                             rsmi_activity_metric_t activity_metric_type,
                             rsmi_activity_metric_counter_t* activity_metric_counter) {
  TRY
  std::ostringstream ostrstream;
  ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======";
  LOG_TRACE(ostrstream);

  if (!activity_metric_counter) {
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << activity_metric_type
               << " | Cause: rsmi_activity_metric_counter_t was a null ptr reference"
               << " | Returning = "
               << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
    LOG_ERROR(ostrstream);
    return rsmi_status_t::RSMI_STATUS_INVALID_ARGS;
  }

  // A selector without any known flag would otherwise report success while
  // leaving every output field unwritten; reject it up front instead.
  const unsigned int known_metrics =
      static_cast<unsigned int>(rsmi_activity_metric_t::RSMI_ACTIVITY_GFX) |
      static_cast<unsigned int>(rsmi_activity_metric_t::RSMI_ACTIVITY_UMC) |
      static_cast<unsigned int>(rsmi_activity_metric_t::RSMI_ACTIVITY_MM);
  if ((static_cast<unsigned int>(activity_metric_type) & known_metrics) == 0) {
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << activity_metric_type
               << " | Cause: activity_metric_type does not select any known activity metric"
               << " | Returning = "
               << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
    LOG_ERROR(ostrstream);
    return rsmi_status_t::RSMI_STATUS_INVALID_ARGS;
  }

  auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
  rsmi_gpu_metrics_t gpu_metrics;
  status_code = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics);
  if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << activity_metric_type
               << " | Cause: rsmi_dev_gpu_metrics_info_get returned "
               << getRSMIStatusString(status_code)
               << " | Returning = "
               << status_code << " |";
    LOG_ERROR(ostrstream);
    return status_code;
  }

  // Each flag is handled on its own so that a combined mask fills several
  // fields in one call; fields of unrequested metrics stay untouched.
  if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_GFX) {
    activity_metric_counter->average_gfx_activity = gpu_metrics.average_gfx_activity;
    ostrstream << __PRETTY_FUNCTION__
               << " | For GFX: " << activity_metric_counter->average_gfx_activity;
    LOG_INFO(ostrstream);
  }
  if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_UMC) {
    activity_metric_counter->average_umc_activity = gpu_metrics.average_umc_activity;
    ostrstream << __PRETTY_FUNCTION__
               << " | For UMC: " << activity_metric_counter->average_umc_activity;
    LOG_INFO(ostrstream);
  }
  if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_MM) {
    activity_metric_counter->average_mm_activity = gpu_metrics.average_mm_activity;
    ostrstream << __PRETTY_FUNCTION__
               << " | For MM: " << activity_metric_counter->average_mm_activity;
    LOG_INFO(ostrstream);
  }

  ostrstream << __PRETTY_FUNCTION__
             << " | ======= end ======= "
             << " | Success "
             << " | Device #: " << dv_ind
             << " | Metric Type: " << activity_metric_type
             << " | Returning = "
             << getRSMIStatusString(status_code) << " |";
  LOG_INFO(ostrstream);

  return status_code;
  CATCH
}
/**
 * Retrieves the average MM (UVD or VCN) activity of device @p dv_ind.
 *
 * Requests RSMI_ACTIVITY_MM through rsmi_dev_activity_metric_get() and, on
 * success, stores the resulting value in the caller's variable.
 *
 * @param dv_ind       device index
 * @param avg_activity receives the average MM activity; only written when
 *                     the call succeeds
 *
 * @return RSMI_STATUS_INVALID_ARGS if @p avg_activity is null; otherwise
 *         the status returned by rsmi_dev_activity_metric_get().
 */
rsmi_status_t
rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity) {
  TRY
  std::ostringstream ostrstream;
  ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======";
  LOG_TRACE(ostrstream);

  if (!avg_activity) {
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM
               << " | Cause: avg_activity was a null ptr reference"
               << " | Returning = "
               << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
    LOG_ERROR(ostrstream);
    return rsmi_status_t::RSMI_STATUS_INVALID_ARGS;
  }

  rsmi_activity_metric_counter_t activity_metric_counter = {};
  const rsmi_status_t status_code = rsmi_dev_activity_metric_get(
      dv_ind, rsmi_activity_metric_t::RSMI_ACTIVITY_MM, &activity_metric_counter);
  if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
    // Leave the caller's variable untouched and report the failure as such.
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM
               << " | Cause: rsmi_dev_activity_metric_get returned "
               << getRSMIStatusString(status_code)
               << " | Returning = "
               << getRSMIStatusString(status_code) << " |";
    LOG_ERROR(ostrstream);
    return status_code;
  }

  // Store through the pointer. Reassigning the pointer parameter itself
  // would only change this function's local copy, and the caller would
  // never see the value.
  *avg_activity = activity_metric_counter.average_mm_activity;

  ostrstream << __PRETTY_FUNCTION__
             << " | ======= end ======= "
             << " | Success "
             << " | Device #: " << dv_ind
             << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM
             << " | Returning = "
             << getRSMIStatusString(status_code) << " |";
  LOG_INFO(ostrstream);

  return status_code;
  CATCH
}
rsmi_status_t
rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len) {
TRY
+3 -3
Wyświetl plik
@@ -288,8 +288,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
rsmi_gpu_metrics->gfx_activity_acc,
"rsmi_gpu_metrics->gfx_activity_acc")
<< print_unsigned_hex_and_int(
rsmi_gpu_metrics->mem_actvity_acc,
"rsmi_gpu_metrics->mem_actvity_acc");
rsmi_gpu_metrics->mem_activity_acc,
"rsmi_gpu_metrics->mem_activity_acc");
for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) {
ss << print_unsigned_hex_and_int(
rsmi_gpu_metrics->temperature_hbm[i],
@@ -414,7 +414,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
// These fields didn't exist in v0
data->gfx_activity_acc = 0;
data->mem_actvity_acc = 0;
data->mem_activity_acc = 0;
(void)memset(data->temperature_hbm, 0,
RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t));
} // else handle other conversions to format 1
@@ -177,8 +177,8 @@ void TestGpuMetricsRead::Run(void) {
<< std::to_string(smu.pcie_link_speed) << '\n';
std::cout << "gfx_activity_acc="
<< std::dec << smu.gfx_activity_acc << '\n';
std::cout << "mem_actvity_acc="
<< std::dec << smu.mem_actvity_acc << '\n';
std::cout << "mem_activity_acc="
<< std::dec << smu.mem_activity_acc << '\n';
for (int i = 0; i < RSMI_NUM_HBM_INSTANCES; ++i) {
std::cout << "temperature_hbm[" << i << "]=" << std::dec <<