rocm_smi_lib: Fix [linux BM] [AMDSMI] Memory Bandwidth
Implements APIs for 'gpu_metrics_v1_3' utilization averages Code changes related to the following: * rsmi_dev_activity_metric_get() * rsmi_dev_activity_avg_mm_get() * CLI shows "Avg.Memory Bandwidth" under "--showmemuse" Change-Id: I8e4600f350a7c18499abf022534db2b875f09d5f Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
This commit is contained in:
zatwierdzone przez
Daniel Oliveira
rodzic
b99867eb80
commit
e0483f2ee2
@@ -466,6 +466,19 @@ typedef enum {
|
||||
RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
|
||||
} rsmi_temperature_type_t;
|
||||
|
||||
/**
|
||||
* @brief Activity (Utilization) Metrics. This enum is used to identify
|
||||
* various activity metrics.
|
||||
*
|
||||
*/
|
||||
typedef enum {
|
||||
/* Utilization */
// Each enumerator below occupies its own bit (bits 0, 1 and 2), so the
// values can be OR-ed together into a mask; rsmi_dev_activity_metric_get()
// tests each bit separately with '&'.
|
||||
RSMI_ACTIVITY_GFX = (0x1 << 0), //!< GFX activity (presumably the graphics engine - confirm)
|
||||
RSMI_ACTIVITY_UMC = (0x1 << 1), //!< memory controller
|
||||
RSMI_ACTIVITY_MM = (0x1 << 2) //!< UVD or VCN
|
||||
} rsmi_activity_metric_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Voltage Metrics. This enum is used to identify various
|
||||
* Voltage metrics. Corresponding values will be in millivolts.
|
||||
@@ -774,6 +787,17 @@ typedef struct {
|
||||
typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth;
|
||||
/// \endcond
|
||||
|
||||
/**
|
||||
* @brief This structure holds information about the possible activity
|
||||
* averages. Specifically, the utilization counters.
|
||||
*/
|
||||
typedef struct {
|
||||
/* Utilization */
// Each field is filled by rsmi_dev_activity_metric_get() from the field of
// the same name in rsmi_gpu_metrics_t, and only when the matching
// rsmi_activity_metric_t bit was requested.
// NOTE(review): units are not visible here (presumably percent) - confirm.
|
||||
uint16_t average_gfx_activity; //!< GFX activity (presumably the graphics engine - confirm)
|
||||
uint16_t average_umc_activity; //!< memory controller
|
||||
uint16_t average_mm_activity; //!< UVD or VCN
|
||||
} rsmi_activity_metric_counter_t;
|
||||
|
||||
/**
|
||||
* @brief This structure holds version information.
|
||||
*/
|
||||
@@ -964,7 +988,7 @@ typedef struct {
|
||||
uint16_t padding; // new in v1
|
||||
|
||||
uint32_t gfx_activity_acc; // new in v1
|
||||
uint32_t mem_actvity_acc; // new in v1
|
||||
uint32_t mem_activity_acc; // new in v1
|
||||
uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1
|
||||
/// \endcond
|
||||
} rsmi_gpu_metrics_t;
|
||||
@@ -2259,6 +2283,57 @@ rsmi_utilization_count_get(uint32_t dv_ind,
|
||||
uint32_t count,
|
||||
uint64_t *timestamp);
|
||||
|
||||
/**
|
||||
* @brief Get activity metric average utilization counter of the specified device
|
||||
*
|
||||
* @details Given a device index @p dv_ind, the activity metric type,
|
||||
* this function returns the requested utilization counters
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[in] activity_metric_type a metric type
|
||||
*
|
||||
* @param[inout] activity_metric_counter Multiple utilization counters can be retrieved with a single
|
||||
* call. The caller must allocate enough space to the rsmi_activity_metric_counter_t structure.
|
||||
*
|
||||
* If the function returns RSMI_STATUS_SUCCESS, the value of each requested metric type
|
||||
* is written to the corresponding field of the
|
||||
* rsmi_activity_metric_counter_t structure.
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS call was successful
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
|
||||
* support this function with the given arguments
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*
|
||||
*/
|
||||
rsmi_status_t
|
||||
rsmi_dev_activity_metric_get(uint32_t dv_ind,
|
||||
rsmi_activity_metric_t activity_metric_type,
|
||||
rsmi_activity_metric_counter_t* activity_metric_counter);
|
||||
|
||||
/**
|
||||
* @brief Get activity metric bandwidth average utilization counter of the specified device
|
||||
*
|
||||
* @details Given a device index @p dv_ind, this function returns the
|
||||
* average utilization counter for the RSMI_ACTIVITY_MM (UVD or VCN) metric
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[inout] avg_activity average bandwidth utilization counters can be retrieved
|
||||
*
|
||||
* If the function returns RSMI_STATUS_SUCCESS, the average RSMI_ACTIVITY_MM
|
||||
* utilization counter is written to the location
|
||||
* pointed to by @p avg_activity.
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS call was successful
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
|
||||
* support this function with the given arguments
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*
|
||||
*/
|
||||
rsmi_status_t
|
||||
rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity);
|
||||
|
||||
/**
|
||||
* @brief Get the performance level of the device with provided
|
||||
* device index.
|
||||
|
||||
@@ -2160,6 +2160,7 @@ def showMemUse(deviceList):
|
||||
@param deviceList: List of DRM devices (can be a single-item list)
|
||||
"""
|
||||
memoryUse = c_uint64()
|
||||
avgMemBandwidth = c_uint16()
|
||||
printLogSpacer(' Current Memory Use ')
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse))
|
||||
@@ -2171,6 +2172,12 @@ def showMemUse(deviceList):
|
||||
printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val)
|
||||
else:
|
||||
printLog(device, 'Memory Activity', 'N/A')
|
||||
|
||||
ret = rocmsmi.rsmi_dev_activity_avg_mm_get(device, byref(avgMemBandwidth))
|
||||
if rsmi_ret_ok(ret, device, silent=True):
|
||||
printLog(device, 'Avg. Memory Bandwidth', avgMemBandwidth.value)
|
||||
else:
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
|
||||
+114
-1
@@ -3295,7 +3295,7 @@ rsmi_utilization_count_get(uint32_t dv_ind,
|
||||
val_ui32 = gpu_metrics.gfx_activity_acc;
|
||||
break;
|
||||
case RSMI_COARSE_GRAIN_MEM_ACTIVITY:
|
||||
val_ui32 = gpu_metrics.mem_actvity_acc;
|
||||
val_ui32 = gpu_metrics.mem_activity_acc;
|
||||
break;
|
||||
default:
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
@@ -3312,6 +3312,119 @@ rsmi_utilization_count_get(uint32_t dv_ind,
|
||||
CATCH
|
||||
}
|
||||
|
||||
/**
 * Copies the requested average-activity values of a device into
 * @p activity_metric_counter.
 *
 * @p activity_metric_type is treated as a bit mask of rsmi_activity_metric_t
 * values. For every recognised bit that is set, the matching field is copied
 * from the table returned by rsmi_dev_gpu_metrics_info_get(); fields whose bit
 * is not set are left untouched.
 *
 * @param[in] dv_ind device index, forwarded to rsmi_dev_gpu_metrics_info_get()
 * @param[in] activity_metric_type one or more rsmi_activity_metric_t bits
 * @param[inout] activity_metric_counter caller-owned structure that receives
 * the requested values; must not be null
 *
 * @retval ::RSMI_STATUS_SUCCESS every requested field was written
 * @retval ::RSMI_STATUS_INVALID_ARGS @p activity_metric_counter is null, or
 * @p activity_metric_type contains no recognised metric bit
 * @retval other any failure status returned by rsmi_dev_gpu_metrics_info_get()
 */
rsmi_status_t
rsmi_dev_activity_metric_get(uint32_t dv_ind,
                             rsmi_activity_metric_t activity_metric_type,
                             rsmi_activity_metric_counter_t* activity_metric_counter) {
  TRY
  // NOTE(review): the same stream is reused for every log statement below;
  // this assumes the LOG_* macros reset it after emitting - confirm.
  std::ostringstream ostrstream;
  ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======";
  LOG_TRACE(ostrstream);

  if (!activity_metric_counter) {
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << activity_metric_type
               << " | Cause: rsmi_activity_metric_counter_t was a null ptr reference"
               << " | Returning = "
               << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
    LOG_ERROR(ostrstream);
    return rsmi_status_t::RSMI_STATUS_INVALID_ARGS;
  }

  // Reject a mask that selects none of the known metrics: without this check
  // the call would report success while writing nothing to the output.
  const uint32_t known_metrics =
      static_cast<uint32_t>(rsmi_activity_metric_t::RSMI_ACTIVITY_GFX) |
      static_cast<uint32_t>(rsmi_activity_metric_t::RSMI_ACTIVITY_UMC) |
      static_cast<uint32_t>(rsmi_activity_metric_t::RSMI_ACTIVITY_MM);
  if ((static_cast<uint32_t>(activity_metric_type) & known_metrics) == 0) {
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << activity_metric_type
               << " | Cause: no recognised activity metric type was requested"
               << " | Returning = "
               << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
    LOG_ERROR(ostrstream);
    return rsmi_status_t::RSMI_STATUS_INVALID_ARGS;
  }

  rsmi_gpu_metrics_t gpu_metrics;
  const rsmi_status_t status_code =
      rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics);
  if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << activity_metric_type
               << " | Cause: rsmi_dev_gpu_metrics_info_get returned "
               << getRSMIStatusString(status_code)
               << " | Returning = "
               << getRSMIStatusString(status_code) << " |";
    LOG_ERROR(ostrstream);
    return status_code;
  }

  // Each metric bit is handled independently so that several counters can be
  // retrieved with a single call.
  if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_GFX) {
    activity_metric_counter->average_gfx_activity = gpu_metrics.average_gfx_activity;
    ostrstream << __PRETTY_FUNCTION__
               << " | For GFX: " << activity_metric_counter->average_gfx_activity;
    LOG_INFO(ostrstream);
  }
  if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_UMC) {
    activity_metric_counter->average_umc_activity = gpu_metrics.average_umc_activity;
    ostrstream << __PRETTY_FUNCTION__
               << " | For UMC: " << activity_metric_counter->average_umc_activity;
    LOG_INFO(ostrstream);
  }
  if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_MM) {
    activity_metric_counter->average_mm_activity = gpu_metrics.average_mm_activity;
    ostrstream << __PRETTY_FUNCTION__
               << " | For MM: " << activity_metric_counter->average_mm_activity;
    LOG_INFO(ostrstream);
  }

  ostrstream << __PRETTY_FUNCTION__
             << " | ======= end ======= "
             << " | Success "
             << " | Device #: " << dv_ind
             << " | Metric Type: " << activity_metric_type
             << " | Returning = "
             << getRSMIStatusString(status_code) << " |";
  LOG_INFO(ostrstream);

  return status_code;
  CATCH
}
|
||||
|
||||
/**
 * Retrieves the average RSMI_ACTIVITY_MM (UVD or VCN) activity of a device.
 *
 * Thin wrapper around rsmi_dev_activity_metric_get() that requests only the
 * RSMI_ACTIVITY_MM metric and stores the result through @p avg_activity.
 *
 * NOTE(review): the CLI prints this value as "Avg. Memory Bandwidth", while
 * the enum documents the memory controller as RSMI_ACTIVITY_UMC - confirm
 * that MM is the intended metric.
 *
 * @param[in] dv_ind device index, forwarded to rsmi_dev_activity_metric_get()
 * @param[inout] avg_activity location that receives the value; must not be
 * null. It is written only when the call succeeds.
 *
 * @retval ::RSMI_STATUS_SUCCESS the value was written to @p avg_activity
 * @retval ::RSMI_STATUS_INVALID_ARGS @p avg_activity is null
 * @retval other any failure status returned by rsmi_dev_activity_metric_get()
 */
rsmi_status_t
rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity) {
  TRY
  // NOTE(review): the same stream is reused for every log statement below;
  // this assumes the LOG_* macros reset it after emitting - confirm.
  std::ostringstream ostrstream;
  ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======";
  LOG_TRACE(ostrstream);

  if (!avg_activity) {
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM
               << " | Cause: avg_activity was a null ptr reference"
               << " | Returning = "
               << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
    LOG_ERROR(ostrstream);
    return rsmi_status_t::RSMI_STATUS_INVALID_ARGS;
  }

  // Zero-initialised so no indeterminate value can be observed if the
  // callee leaves a field untouched.
  rsmi_activity_metric_counter_t activity_metric_counter = {};
  const rsmi_status_t status_code = rsmi_dev_activity_metric_get(
      dv_ind, rsmi_activity_metric_t::RSMI_ACTIVITY_MM, &activity_metric_counter);

  if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= end ======= "
               << " | Fail "
               << " | Device #: " << dv_ind
               << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM
               << " | Cause: rsmi_dev_activity_metric_get returned "
               << getRSMIStatusString(status_code)
               << " | Returning = "
               << getRSMIStatusString(status_code) << " |";
    LOG_ERROR(ostrstream);
    return status_code;
  }

  // Store through the pointer. Assigning to the pointer itself would only
  // change the local parameter and the caller would never see the value.
  *avg_activity = activity_metric_counter.average_mm_activity;

  ostrstream << __PRETTY_FUNCTION__
             << " | ======= end ======= "
             << " | Success "
             << " | Device #: " << dv_ind
             << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM
             << " | Returning = "
             << getRSMIStatusString(status_code) << " |";
  LOG_INFO(ostrstream);

  return status_code;
  CATCH
}
|
||||
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len) {
|
||||
TRY
|
||||
|
||||
@@ -288,8 +288,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
|
||||
rsmi_gpu_metrics->gfx_activity_acc,
|
||||
"rsmi_gpu_metrics->gfx_activity_acc")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->mem_actvity_acc,
|
||||
"rsmi_gpu_metrics->mem_actvity_acc");
|
||||
rsmi_gpu_metrics->mem_activity_acc,
|
||||
"rsmi_gpu_metrics->mem_activity_acc");
|
||||
for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) {
|
||||
ss << print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_hbm[i],
|
||||
@@ -414,7 +414,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
|
||||
|
||||
// These fields didn't exist in v0
|
||||
data->gfx_activity_acc = 0;
|
||||
data->mem_actvity_acc = 0;
|
||||
data->mem_activity_acc = 0;
|
||||
(void)memset(data->temperature_hbm, 0,
|
||||
RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t));
|
||||
} // else handle other conversions to format 1
|
||||
|
||||
@@ -177,8 +177,8 @@ void TestGpuMetricsRead::Run(void) {
|
||||
<< std::to_string(smu.pcie_link_speed) << '\n';
|
||||
std::cout << "gfx_activity_acc="
|
||||
<< std::dec << smu.gfx_activity_acc << '\n';
|
||||
std::cout << "mem_actvity_acc="
|
||||
<< std::dec << smu.mem_actvity_acc << '\n';
|
||||
std::cout << "mem_activity_acc="
|
||||
<< std::dec << smu.mem_activity_acc << '\n';
|
||||
|
||||
for (int i = 0; i < RSMI_NUM_HBM_INSTANCES; ++i) {
|
||||
std::cout << "temperature_hbm[" << i << "]=" << std::dec <<
|
||||
|
||||
Reference in New Issue
Block a user