[SWDEV-463406] Update sample rate + align metric output
Changes:
- Corrected the fastest rate at which users can sample from FW/driver
to 100 ms
- Added a warning to amdsmi_get_violation_status() about
the 100 ms delay the call requires in order to sample
- Removed guest support; this API will not be supported on guests
- Updated CLI `amd-smi metric --throttle` outputs from
XXX_active -> XXX_status
XXX_percent -> XXX_activity
to align with host
- Changelog updated
Change-Id: Ib30dd35dcc04ff67904ca82c86a55a16689df226
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
This commit is contained in:
committed by
Maisam Arif
parent
00b3184e9f
commit
0ceca28f41
+29
-26
@@ -34,41 +34,44 @@ Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to
|
||||
$ amd-smi metric --throttle
|
||||
GPU: 0
|
||||
THROTTLE:
|
||||
ACCUMULATION_COUNTER: 1226415116
|
||||
ACCUMULATION_COUNTER: 3808991
|
||||
PROCHOT_ACCUMULATED: 0
|
||||
PPT_ACCUMULATED: 12
|
||||
SOCKET_THERMAL_ACCUMULATED: 0
|
||||
PPT_ACCUMULATED: 585613
|
||||
SOCKET_THERMAL_ACCUMULATED: 2190
|
||||
VR_THERMAL_ACCUMULATED: 0
|
||||
HBM_THERMAL_ACCUMULATED: 0
|
||||
PROCHOT_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
PPT_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
SOCKET_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
VR_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
HBM_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
PROCHOT_VIOLATION_PERCENT: 0 %
|
||||
PPT_VIOLATION_PERCENT: 0 %
|
||||
SOCKET_THERMAL_VIOLATION_PERCENT: 0 %
|
||||
VR_THERMAL_VIOLATION_PERCENT: 0 %
|
||||
HBM_THERMAL_VIOLATION_PERCENT: 0 %
|
||||
PROCHOT_VIOLATION_STATUS: NOT ACTIVE
|
||||
PPT_VIOLATION_STATUS: NOT ACTIVE
|
||||
SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
PROCHOT_VIOLATION_ACTIVITY: 0 %
|
||||
PPT_VIOLATION_ACTIVITY: 0 %
|
||||
SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
VR_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
HBM_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
|
||||
|
||||
|
||||
GPU: 1
|
||||
THROTTLE:
|
||||
ACCUMULATION_COUNTER: 1226415121
|
||||
ACCUMULATION_COUNTER: 3806335
|
||||
PROCHOT_ACCUMULATED: 0
|
||||
PPT_ACCUMULATED: 12
|
||||
SOCKET_THERMAL_ACCUMULATED: 0
|
||||
PPT_ACCUMULATED: 586332
|
||||
SOCKET_THERMAL_ACCUMULATED: 18010
|
||||
VR_THERMAL_ACCUMULATED: 0
|
||||
HBM_THERMAL_ACCUMULATED: 0
|
||||
PROCHOT_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
PPT_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
SOCKET_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
VR_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
HBM_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE
|
||||
PROCHOT_VIOLATION_PERCENT: 0 %
|
||||
PPT_VIOLATION_PERCENT: 0 %
|
||||
SOCKET_THERMAL_VIOLATION_PERCENT: 0 %
|
||||
VR_THERMAL_VIOLATION_PERCENT: 0 %
|
||||
HBM_THERMAL_VIOLATION_PERCENT: 0 %
|
||||
PROCHOT_VIOLATION_STATUS: NOT ACTIVE
|
||||
PPT_VIOLATION_STATUS: NOT ACTIVE
|
||||
SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
PROCHOT_VIOLATION_ACTIVITY: 0 %
|
||||
PPT_VIOLATION_ACTIVITY: 0 %
|
||||
SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
VR_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
HBM_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
|
||||
...
|
||||
```
|
||||
|
||||
|
||||
@@ -2096,7 +2096,7 @@ class AMDSMICommands():
|
||||
if "throttle" in current_platform_args:
|
||||
if args.throttle:
|
||||
throttle_status = {
|
||||
# violation status values - counter/accumulated
|
||||
# Current values - counter/accumulated
|
||||
'accumulation_counter': "N/A",
|
||||
'prochot_accumulated': "N/A",
|
||||
'ppt_accumulated': "N/A",
|
||||
@@ -2104,19 +2104,19 @@ class AMDSMICommands():
|
||||
'vr_thermal_accumulated': "N/A",
|
||||
'hbm_thermal_accumulated': "N/A",
|
||||
|
||||
# violation status values - active
|
||||
'prochot_violation_active': "N/A",
|
||||
'ppt_violation_active': "N/A",
|
||||
'socket_thermal_violation_active': "N/A",
|
||||
'vr_thermal_violation_active': "N/A",
|
||||
'hbm_thermal_violation_active': "N/A",
|
||||
# violation status values - active/not active
|
||||
'prochot_violation_status': "N/A",
|
||||
'ppt_violation_status': "N/A",
|
||||
'socket_thermal_violation_status': "N/A",
|
||||
'vr_thermal_violation_status': "N/A",
|
||||
'hbm_thermal_violation_status': "N/A",
|
||||
|
||||
# violation status values - percent
|
||||
'prochot_violation_percent': "N/A",
|
||||
'ppt_violation_percent': "N/A",
|
||||
'socket_thermal_violation_percent': "N/A",
|
||||
'vr_thermal_violation_percent': "N/A",
|
||||
'hbm_thermal_violation_percent': "N/A"
|
||||
# violation activity values - percent
|
||||
'prochot_violation_activity': "N/A",
|
||||
'ppt_violation_activity': "N/A",
|
||||
'socket_thermal_violation_activity': "N/A",
|
||||
'vr_thermal_violation_activity': "N/A",
|
||||
'hbm_thermal_violation_activity': "N/A"
|
||||
}
|
||||
|
||||
try:
|
||||
@@ -2128,31 +2128,31 @@ class AMDSMICommands():
|
||||
throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm']
|
||||
throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm']
|
||||
|
||||
throttle_status['prochot_violation_active'] = violation_status['active_prochot_thrm']
|
||||
throttle_status['ppt_violation_active'] = violation_status['active_ppt_pwr']
|
||||
throttle_status['socket_thermal_violation_active'] = violation_status['active_socket_thrm']
|
||||
throttle_status['vr_thermal_violation_active'] = violation_status['active_vr_thrm']
|
||||
throttle_status['hbm_thermal_violation_active'] = violation_status['active_hbm_thrm']
|
||||
throttle_status['prochot_violation_status'] = violation_status['active_prochot_thrm']
|
||||
throttle_status['ppt_violation_status'] = violation_status['active_ppt_pwr']
|
||||
throttle_status['socket_thermal_violation_status'] = violation_status['active_socket_thrm']
|
||||
throttle_status['vr_thermal_violation_status'] = violation_status['active_vr_thrm']
|
||||
throttle_status['hbm_thermal_violation_status'] = violation_status['active_hbm_thrm']
|
||||
|
||||
throttle_status['prochot_violation_percent'] = violation_status['per_prochot_thrm']
|
||||
throttle_status['ppt_violation_percent'] = violation_status['per_ppt_pwr']
|
||||
throttle_status['socket_thermal_violation_percent'] = violation_status['per_socket_thrm']
|
||||
throttle_status['vr_thermal_violation_percent'] = violation_status['per_vr_thrm']
|
||||
throttle_status['hbm_thermal_violation_percent'] = violation_status['per_hbm_thrm']
|
||||
throttle_status['prochot_violation_activity'] = violation_status['per_prochot_thrm']
|
||||
throttle_status['ppt_violation_activity'] = violation_status['per_ppt_pwr']
|
||||
throttle_status['socket_thermal_violation_activity'] = violation_status['per_socket_thrm']
|
||||
throttle_status['vr_thermal_violation_activity'] = violation_status['per_vr_thrm']
|
||||
throttle_status['hbm_thermal_violation_activity'] = violation_status['per_hbm_thrm']
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['throttle'] = throttle_status
|
||||
logging.debug("Failed to get violation status' for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
for key, value in throttle_status.items():
|
||||
if "active" in key:
|
||||
if "_status" in key:
|
||||
if value is True:
|
||||
throttle_status[key] = "ACTIVE"
|
||||
elif value is False:
|
||||
throttle_status[key] = "NOT ACTIVE"
|
||||
continue
|
||||
|
||||
if "percent" not in key:
|
||||
if "_activity" not in key:
|
||||
continue
|
||||
|
||||
activity_unit = '%'
|
||||
|
||||
@@ -5099,12 +5099,16 @@ amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle, amdsmi_vram_
|
||||
/**
|
||||
* @brief Returns the violations for a processor
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf}
|
||||
* Warning: This API will be slow because it polls the driver for 2 samples. It requires
|
||||
* a minimum wait of 100 ms between the 2 samples in order to calculate. Otherwise,
|
||||
* users would need to use amdsmi_get_gpu_metrics_info for BM. See that API's struct
|
||||
* for calculations.
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
*
|
||||
* @param[in] processor_handle Device which to query
|
||||
*
|
||||
*
|
||||
* @param[in,out] info Reference to all violation status details available.
|
||||
* @param[out] info Reference to all violation status details available.
|
||||
* Must be allocated by user.
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
|
||||
@@ -632,8 +632,10 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
if (violation_status == nullptr) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
// 1 sec = 1000 ms = 1000000 us
|
||||
constexpr uint64_t kFASTEST_POLL_TIME_MS = 1; // fastest SMU FW sample time is 1ms
|
||||
// 0.1 sec = 100 ms = 100000 us
|
||||
constexpr uint64_t kFASTEST_POLL_TIME_MS = 100; // fastest SMU FW sample time is 100 ms
|
||||
|
||||
violation_status->reference_timestamp = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->violation_timestamp = std::numeric_limits<uint64_t>::max();
|
||||
@@ -700,7 +702,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
// wait 1ms before reading again
|
||||
// wait 100ms before reading again
|
||||
system_wait(static_cast<int>(kFASTEST_POLL_TIME_MS));
|
||||
|
||||
amdsmi_gpu_metrics_t metric_info_b = {};
|
||||
|
||||
Reference in New Issue
Block a user