diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index e0222b27c7..45c7470b2b 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -34,41 +34,44 @@ Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to $ amd-smi metric --throttle GPU: 0 THROTTLE: - ACCUMULATION_COUNTER: 1226415116 + ACCUMULATION_COUNTER: 3808991 PROCHOT_ACCUMULATED: 0 - PPT_ACCUMULATED: 12 - SOCKET_THERMAL_ACCUMULATED: 0 + PPT_ACCUMULATED: 585613 + SOCKET_THERMAL_ACCUMULATED: 2190 VR_THERMAL_ACCUMULATED: 0 HBM_THERMAL_ACCUMULATED: 0 - PROCHOT_VIOLATION_ACTIVE: NOT ACTIVE - PPT_VIOLATION_ACTIVE: NOT ACTIVE - SOCKET_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE - VR_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE - HBM_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE - PROCHOT_VIOLATION_PERCENT: 0 % - PPT_VIOLATION_PERCENT: 0 % - SOCKET_THERMAL_VIOLATION_PERCENT: 0 % - VR_THERMAL_VIOLATION_PERCENT: 0 % - HBM_THERMAL_VIOLATION_PERCENT: 0 % + PROCHOT_VIOLATION_STATUS: NOT ACTIVE + PPT_VIOLATION_STATUS: NOT ACTIVE + SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE + VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE + HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE + PROCHOT_VIOLATION_ACTIVITY: 0 % + PPT_VIOLATION_ACTIVITY: 0 % + SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 % + VR_THERMAL_VIOLATION_ACTIVITY: 0 % + HBM_THERMAL_VIOLATION_ACTIVITY: 0 % + + GPU: 1 THROTTLE: - ACCUMULATION_COUNTER: 1226415121 + ACCUMULATION_COUNTER: 3806335 PROCHOT_ACCUMULATED: 0 - PPT_ACCUMULATED: 12 - SOCKET_THERMAL_ACCUMULATED: 0 + PPT_ACCUMULATED: 586332 + SOCKET_THERMAL_ACCUMULATED: 18010 VR_THERMAL_ACCUMULATED: 0 HBM_THERMAL_ACCUMULATED: 0 - PROCHOT_VIOLATION_ACTIVE: NOT ACTIVE - PPT_VIOLATION_ACTIVE: NOT ACTIVE - SOCKET_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE - VR_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE - HBM_THERMAL_VIOLATION_ACTIVE: NOT ACTIVE - PROCHOT_VIOLATION_PERCENT: 0 % - PPT_VIOLATION_PERCENT: 0 % - SOCKET_THERMAL_VIOLATION_PERCENT: 0 % - VR_THERMAL_VIOLATION_PERCENT: 0 % - 
HBM_THERMAL_VIOLATION_PERCENT: 0 % + PROCHOT_VIOLATION_STATUS: NOT ACTIVE + PPT_VIOLATION_STATUS: NOT ACTIVE + SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE + VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE + HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE + PROCHOT_VIOLATION_ACTIVITY: 0 % + PPT_VIOLATION_ACTIVITY: 0 % + SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 % + VR_THERMAL_VIOLATION_ACTIVITY: 0 % + HBM_THERMAL_VIOLATION_ACTIVITY: 0 % + ... ``` diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 53f777f56d..74735cb5e8 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -2096,7 +2096,7 @@ class AMDSMICommands(): if "throttle" in current_platform_args: if args.throttle: throttle_status = { - # violation status values - counter/accumulated + # Current values - counter/accumulated 'accumulation_counter': "N/A", 'prochot_accumulated': "N/A", 'ppt_accumulated': "N/A", @@ -2104,19 +2104,19 @@ class AMDSMICommands(): 'vr_thermal_accumulated': "N/A", 'hbm_thermal_accumulated': "N/A", - # violation status values - active - 'prochot_violation_active': "N/A", - 'ppt_violation_active': "N/A", - 'socket_thermal_violation_active': "N/A", - 'vr_thermal_violation_active': "N/A", - 'hbm_thermal_violation_active': "N/A", + # violation status values - active/not active + 'prochot_violation_status': "N/A", + 'ppt_violation_status': "N/A", + 'socket_thermal_violation_status': "N/A", + 'vr_thermal_violation_status': "N/A", + 'hbm_thermal_violation_status': "N/A", - # violation status values - percent - 'prochot_violation_percent': "N/A", - 'ppt_violation_percent': "N/A", - 'socket_thermal_violation_percent': "N/A", - 'vr_thermal_violation_percent': "N/A", - 'hbm_thermal_violation_percent': "N/A" + # violation activity values - percent + 'prochot_violation_activity': "N/A", + 'ppt_violation_activity': "N/A", + 'socket_thermal_violation_activity': "N/A", + 
'vr_thermal_violation_activity': "N/A", + 'hbm_thermal_violation_activity': "N/A" } try: @@ -2128,31 +2128,31 @@ class AMDSMICommands(): throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm'] throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm'] - throttle_status['prochot_violation_active'] = violation_status['active_prochot_thrm'] - throttle_status['ppt_violation_active'] = violation_status['active_ppt_pwr'] - throttle_status['socket_thermal_violation_active'] = violation_status['active_socket_thrm'] - throttle_status['vr_thermal_violation_active'] = violation_status['active_vr_thrm'] - throttle_status['hbm_thermal_violation_active'] = violation_status['active_hbm_thrm'] + throttle_status['prochot_violation_status'] = violation_status['active_prochot_thrm'] + throttle_status['ppt_violation_status'] = violation_status['active_ppt_pwr'] + throttle_status['socket_thermal_violation_status'] = violation_status['active_socket_thrm'] + throttle_status['vr_thermal_violation_status'] = violation_status['active_vr_thrm'] + throttle_status['hbm_thermal_violation_status'] = violation_status['active_hbm_thrm'] - throttle_status['prochot_violation_percent'] = violation_status['per_prochot_thrm'] - throttle_status['ppt_violation_percent'] = violation_status['per_ppt_pwr'] - throttle_status['socket_thermal_violation_percent'] = violation_status['per_socket_thrm'] - throttle_status['vr_thermal_violation_percent'] = violation_status['per_vr_thrm'] - throttle_status['hbm_thermal_violation_percent'] = violation_status['per_hbm_thrm'] + throttle_status['prochot_violation_activity'] = violation_status['per_prochot_thrm'] + throttle_status['ppt_violation_activity'] = violation_status['per_ppt_pwr'] + throttle_status['socket_thermal_violation_activity'] = violation_status['per_socket_thrm'] + throttle_status['vr_thermal_violation_activity'] = violation_status['per_vr_thrm'] + throttle_status['hbm_thermal_violation_activity'] = 
violation_status['per_hbm_thrm'] except amdsmi_exception.AmdSmiLibraryException as e: values_dict['throttle'] = throttle_status logging.debug("Failed to get violation status' for gpu %s | %s", gpu_id, e.get_error_info()) for key, value in throttle_status.items(): - if "active" in key: + if "_status" in key: if value is True: throttle_status[key] = "ACTIVE" elif value is False: throttle_status[key] = "NOT ACTIVE" continue - if "percent" not in key: + if "_activity" not in key: continue activity_unit = '%' diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 9aad19603b..21ef4b077e 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -5099,12 +5099,16 @@ amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle, amdsmi_vram_ /** * @brief Returns the violations for a processor * - * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf} + * Warning: This API is slow because it polls the driver for 2 samples. It requires + * a minimum wait of 100 ms between the 2 samples in order to calculate its results. Users + * who cannot tolerate this delay would need to use amdsmi_get_gpu_metrics_info on BM instead; see that API's struct + * for the calculations. + * + * @platform{gpu_bm_linux} @platform{host} * * @param[in] processor_handle Device which to query * - * - * @param[in,out] info Reference to all violation status details available. + * @param[out] info Reference to all violation status details available. * Must be allocated by user. 
* * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 60f0ae0d2f..36d805fa98 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -632,8 +632,10 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha if (violation_status == nullptr) { return AMDSMI_STATUS_INVAL; } + // 1 sec = 1000 ms = 1000000 us - constexpr uint64_t kFASTEST_POLL_TIME_MS = 1; // fastest SMU FW sample time is 1ms + // 0.1 sec = 100 ms = 100000 us + constexpr uint64_t kFASTEST_POLL_TIME_MS = 100; // fastest SMU FW sample time is 100 ms violation_status->reference_timestamp = std::numeric_limits::max(); violation_status->violation_timestamp = std::numeric_limits::max(); @@ -700,7 +702,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha return AMDSMI_STATUS_NOT_SUPPORTED; } - // wait 1ms before reading again + // wait 100ms before reading again system_wait(static_cast(kFASTEST_POLL_TIME_MS)); amdsmi_gpu_metrics_t metric_info_b = {};