[SWDEV-463406] Update sample rate + align metric output

Changes:
- Corrected the fastest interval at which users can sample from
  FW/driver to 100 ms
- Added a warning to the amdsmi_get_violation_status() documentation
  about the 100 ms delay required between samples
- Removed guest support; this API will not be supported on guests
- Updated CLI `amd-smi metric --throttle` outputs from
    XXX_active -> XXX_status
    XXX_percent -> XXX_activity
  to align with host
- Changelog updated

Change-Id: Ib30dd35dcc04ff67904ca82c86a55a16689df226
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: 0ceca28f41]
This commit is contained in:
Charis Poag
2024-10-23 13:46:07 -05:00
committed by Maisam Arif
parent 118ce35c67
commit 6e0b0792ab
4 changed files with 65 additions and 56 deletions
+29 -26
View File
@@ -34,41 +34,44 @@ Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to
$ amd-smi metric --throttle
GPU: 0
THROTTLE:
ACCUMULATION_COUNTER: 1226415116
ACCUMULATION_COUNTER: 3808991
PROCHOT_ACCUMULATED: 0
PPT_ACCUMULATED: 12
SOCKET_THERMAL_ACCUMULATED: 0
PPT_ACCUMULATED: 585613
SOCKET_THERMAL_ACCUMULATED: 2190
VR_THERMAL_ACCUMULATED: 0
HBM_THERMAL_ACCUMULATED: 0
PROCHOT_VIOLATION_ACTIVE: NOT ACTIVE
PPT_VIOLATION_ACTIVE: NOT ACTIVE
SOCKET_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
VR_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
HBM_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
PROCHOT_VIOLATION_PERCENT: 0 %
PPT_VIOLATION_PERCENT: 0 %
SOCKET_THERMAL_VIOLATION_PERCENT: 0 %
VR_THERMAL_VIOLATION_PERCENT: 0 %
HBM_THERMAL_VIOLATION_PERCENT: 0 %
PROCHOT_VIOLATION_STATUS: NOT ACTIVE
PPT_VIOLATION_STATUS: NOT ACTIVE
SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE
VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE
HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE
PROCHOT_VIOLATION_ACTIVITY: 0 %
PPT_VIOLATION_ACTIVITY: 0 %
SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 %
VR_THERMAL_VIOLATION_ACTIVITY: 0 %
HBM_THERMAL_VIOLATION_ACTIVITY: 0 %
GPU: 1
THROTTLE:
ACCUMULATION_COUNTER: 1226415121
ACCUMULATION_COUNTER: 3806335
PROCHOT_ACCUMULATED: 0
PPT_ACCUMULATED: 12
SOCKET_THERMAL_ACCUMULATED: 0
PPT_ACCUMULATED: 586332
SOCKET_THERMAL_ACCUMULATED: 18010
VR_THERMAL_ACCUMULATED: 0
HBM_THERMAL_ACCUMULATED: 0
PROCHOT_VIOLATION_ACTIVE: NOT ACTIVE
PPT_VIOLATION_ACTIVE: NOT ACTIVE
SOCKET_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
VR_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
HBM_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
PROCHOT_VIOLATION_PERCENT: 0 %
PPT_VIOLATION_PERCENT: 0 %
SOCKET_THERMAL_VIOLATION_PERCENT: 0 %
VR_THERMAL_VIOLATION_PERCENT: 0 %
HBM_THERMAL_VIOLATION_PERCENT: 0 %
PROCHOT_VIOLATION_STATUS: NOT ACTIVE
PPT_VIOLATION_STATUS: NOT ACTIVE
SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE
VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE
HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE
PROCHOT_VIOLATION_ACTIVITY: 0 %
PPT_VIOLATION_ACTIVITY: 0 %
SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 %
VR_THERMAL_VIOLATION_ACTIVITY: 0 %
HBM_THERMAL_VIOLATION_ACTIVITY: 0 %
...
```
+25 -25
View File
@@ -2096,7 +2096,7 @@ class AMDSMICommands():
if "throttle" in current_platform_args:
if args.throttle:
throttle_status = {
# violation status values - counter/accumulated
# Current values - counter/accumulated
'accumulation_counter': "N/A",
'prochot_accumulated': "N/A",
'ppt_accumulated': "N/A",
@@ -2104,19 +2104,19 @@ class AMDSMICommands():
'vr_thermal_accumulated': "N/A",
'hbm_thermal_accumulated': "N/A",
# violation status values - active
'prochot_violation_active': "N/A",
'ppt_violation_active': "N/A",
'socket_thermal_violation_active': "N/A",
'vr_thermal_violation_active': "N/A",
'hbm_thermal_violation_active': "N/A",
# violation status values - active/not active
'prochot_violation_status': "N/A",
'ppt_violation_status': "N/A",
'socket_thermal_violation_status': "N/A",
'vr_thermal_violation_status': "N/A",
'hbm_thermal_violation_status': "N/A",
# violation status values - percent
'prochot_violation_percent': "N/A",
'ppt_violation_percent': "N/A",
'socket_thermal_violation_percent': "N/A",
'vr_thermal_violation_percent': "N/A",
'hbm_thermal_violation_percent': "N/A"
# violation activity values - percent
'prochot_violation_activity': "N/A",
'ppt_violation_activity': "N/A",
'socket_thermal_violation_activity': "N/A",
'vr_thermal_violation_activity': "N/A",
'hbm_thermal_violation_activity': "N/A"
}
try:
@@ -2128,31 +2128,31 @@ class AMDSMICommands():
throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm']
throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm']
throttle_status['prochot_violation_active'] = violation_status['active_prochot_thrm']
throttle_status['ppt_violation_active'] = violation_status['active_ppt_pwr']
throttle_status['socket_thermal_violation_active'] = violation_status['active_socket_thrm']
throttle_status['vr_thermal_violation_active'] = violation_status['active_vr_thrm']
throttle_status['hbm_thermal_violation_active'] = violation_status['active_hbm_thrm']
throttle_status['prochot_violation_status'] = violation_status['active_prochot_thrm']
throttle_status['ppt_violation_status'] = violation_status['active_ppt_pwr']
throttle_status['socket_thermal_violation_status'] = violation_status['active_socket_thrm']
throttle_status['vr_thermal_violation_status'] = violation_status['active_vr_thrm']
throttle_status['hbm_thermal_violation_status'] = violation_status['active_hbm_thrm']
throttle_status['prochot_violation_percent'] = violation_status['per_prochot_thrm']
throttle_status['ppt_violation_percent'] = violation_status['per_ppt_pwr']
throttle_status['socket_thermal_violation_percent'] = violation_status['per_socket_thrm']
throttle_status['vr_thermal_violation_percent'] = violation_status['per_vr_thrm']
throttle_status['hbm_thermal_violation_percent'] = violation_status['per_hbm_thrm']
throttle_status['prochot_violation_activity'] = violation_status['per_prochot_thrm']
throttle_status['ppt_violation_activity'] = violation_status['per_ppt_pwr']
throttle_status['socket_thermal_violation_activity'] = violation_status['per_socket_thrm']
throttle_status['vr_thermal_violation_activity'] = violation_status['per_vr_thrm']
throttle_status['hbm_thermal_violation_activity'] = violation_status['per_hbm_thrm']
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['throttle'] = throttle_status
logging.debug("Failed to get violation status' for gpu %s | %s", gpu_id, e.get_error_info())
for key, value in throttle_status.items():
if "active" in key:
if "_status" in key:
if value is True:
throttle_status[key] = "ACTIVE"
elif value is False:
throttle_status[key] = "NOT ACTIVE"
continue
if "percent" not in key:
if "_activity" not in key:
continue
activity_unit = '%'
+7 -3
View File
@@ -5099,12 +5099,16 @@ amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle, amdsmi_vram_
/**
* @brief Returns the violations for a processor
*
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf}
* Warning: this API is slow because it polls the driver for 2 samples and
* requires a minimum wait of 100ms between the 2 samples in order to calculate
* its results. To avoid this delay on BM, users would need to call
* amdsmi_get_gpu_metrics_info instead; see that API's struct for the calculations.
*
* @platform{gpu_bm_linux} @platform{host}
*
* @param[in] processor_handle Device which to query
*
*
* @param[in,out] info Reference to all violation status details available.
* @param[out] info Reference to all violation status details available.
* Must be allocated by user.
*
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
+4 -2
View File
@@ -632,8 +632,10 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
if (violation_status == nullptr) {
return AMDSMI_STATUS_INVAL;
}
// 1 sec = 1000 ms = 1000000 us
constexpr uint64_t kFASTEST_POLL_TIME_MS = 1; // fastest SMU FW sample time is 1ms
// 0.1 sec = 100 ms = 100000 us
constexpr uint64_t kFASTEST_POLL_TIME_MS = 100; // fastest SMU FW sample time is 100 ms
violation_status->reference_timestamp = std::numeric_limits<uint64_t>::max();
violation_status->violation_timestamp = std::numeric_limits<uint64_t>::max();
@@ -700,7 +702,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
return AMDSMI_STATUS_NOT_SUPPORTED;
}
// wait 1ms before reading again
// wait 100ms before reading again
system_wait(static_cast<int>(kFASTEST_POLL_TIME_MS));
amdsmi_gpu_metrics_t metric_info_b = {};