diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 058a31c088..2eb9fcd45a 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2091,7 +2091,7 @@ class AMDSMICommands(): if "throttle" in current_platform_args: if args.throttle: throttle_status = { - # gpu metric values + # violation status values - counter/accumulated 'accumulation_counter': "N/A", 'prochot_accumulated': "N/A", 'ppt_accumulated': "N/A", @@ -2114,20 +2114,15 @@ class AMDSMICommands(): 'hbm_thermal_violation_percent': "N/A" } - try: - throttle_status['accumulation_counter'] = gpu_metric['accumulation_counter'] - throttle_status['prochot_accumulated'] = gpu_metric['prochot_residency_acc'] - throttle_status['ppt_accumulated'] = gpu_metric['ppt_residency_acc'] - throttle_status['socket_thermal_accumulated'] = gpu_metric['socket_thm_residency_acc'] - throttle_status['vr_thermal_accumulated'] = gpu_metric['vr_thm_residency_acc'] - throttle_status['hbm_thermal_accumulated'] = gpu_metric['hbm_thm_residency_acc'] - - except Exception as e: - values_dict['throttle'] = throttle_status - logging.debug("Failed to get gpu metric information for throttle status' for gpu %s | %s", gpu_id, e) - try: violation_status = amdsmi_interface.amdsmi_get_violation_status(args.gpu) + throttle_status['accumulation_counter'] = violation_status['acc_counter'] + throttle_status['prochot_accumulated'] = violation_status['acc_prochot_thrm'] + throttle_status['ppt_accumulated'] = violation_status['acc_ppt_pwr'] + throttle_status['socket_thermal_accumulated'] = violation_status['acc_socket_thrm'] + throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm'] + throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm'] + throttle_status['prochot_violation_active'] = violation_status['active_prochot_thrm'] throttle_status['ppt_violation_active'] = violation_status['active_ppt_pwr'] throttle_status['socket_thermal_violation_active'] = 
violation_status['active_socket_thrm'] diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 3c05386be3..0b11641daf 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -528,11 +528,16 @@ typedef struct { } amdsmi_vram_usage_t; /** * @brief This structure hold violation status information. - * Note: for MI3x asics and higher, older ASICs will show unsupported. */ typedef struct { uint64_t reference_timestamp; //!< Represents CPU timestamp in microseconds (uS) uint64_t violation_timestamp; //!< Violation time in milliseconds (ms) + uint64_t acc_counter; //!< Current accumulated counter; Max uint64 means unsupported + uint64_t acc_prochot_thrm; //!< Current accumulated processor hot violation count; Max uint64 means unsupported + uint64_t acc_ppt_pwr; //!< PVIOL; Current accumulated Package Power Tracking (PPT) count; Max uint64 means unsupported + uint64_t acc_socket_thrm; //!< TVIOL; Current accumulated Socket thermal count; Max uint64 means unsupported + uint64_t acc_vr_thrm; //!< Current accumulated voltage regulator count; Max uint64 means unsupported + uint64_t acc_hbm_thrm; //!< Current accumulated High Bandwidth Memory (HBM) thermal count; Max uint64 means unsupported uint64_t per_prochot_thrm; //!< Processor hot violation % (greater than 0% is a violation); Max uint64 means unsupported uint64_t per_ppt_pwr; //!< PVIOL; Package Power Tracking (PPT) violation % (greater than 0% is a violation); Max uint64 means unsupported uint64_t per_socket_thrm; //!< TVIOL; Socket thermal violation % (greater than 0% is a violation); Max uint64 means unsupported @@ -543,7 +548,7 @@ typedef struct { uint8_t active_socket_thrm; //!< Socket thermal violation; 1 = active 0 = not active; Max uint8 means unsupported uint8_t active_vr_thrm; //!< Voltage regulator violation; 1 = active 0 = not active; Max uint8 means unsupported uint8_t active_hbm_thrm; //!< High Bandwidth Memory (HBM) thermal violation; 1 = active 0 = not active; Max uint8 means 
unsupported - uint64_t reserved[24]; // Reserved for new violation info + uint64_t reserved[30]; // Reserved for new violation info } amdsmi_violation_status_t; typedef struct { amdsmi_range_t supported_freq_range; diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index bed84e26a8..7cc37897b3 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2020,6 +2020,12 @@ def amdsmi_get_violation_status( return { "reference_timestamp": _validate_if_max_uint(violation_status.reference_timestamp, MaxUIntegerTypes.UINT64_T), "violation_timestamp": _validate_if_max_uint(violation_status.violation_timestamp, MaxUIntegerTypes.UINT64_T), + "acc_counter": _validate_if_max_uint(violation_status.acc_counter, MaxUIntegerTypes.UINT64_T), + "acc_prochot_thrm": _validate_if_max_uint(violation_status.acc_prochot_thrm, MaxUIntegerTypes.UINT64_T), + "acc_ppt_pwr": _validate_if_max_uint(violation_status.acc_ppt_pwr, MaxUIntegerTypes.UINT64_T), #PVIOL + "acc_socket_thrm": _validate_if_max_uint(violation_status.acc_socket_thrm, MaxUIntegerTypes.UINT64_T), #TVIOL + "acc_vr_thrm": _validate_if_max_uint(violation_status.acc_vr_thrm, MaxUIntegerTypes.UINT64_T), + "acc_hbm_thrm": _validate_if_max_uint(violation_status.acc_hbm_thrm, MaxUIntegerTypes.UINT64_T), "per_prochot_thrm": _validate_if_max_uint(violation_status.per_prochot_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_ppt_pwr": _validate_if_max_uint(violation_status.per_ppt_pwr, MaxUIntegerTypes.UINT64_T, isActivity=True), #PVIOL "per_socket_thrm": _validate_if_max_uint(violation_status.per_socket_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), #TVIOL diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index e68b0bc2aa..b16593187d 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -734,6 +734,12 @@ struct_amdsmi_violation_status_t._pack_ = 1 # source:False struct_amdsmi_violation_status_t._fields_ = [ 
('reference_timestamp', ctypes.c_uint64), ('violation_timestamp', ctypes.c_uint64), + ('acc_counter', ctypes.c_uint64), + ('acc_prochot_thrm', ctypes.c_uint64), + ('acc_ppt_pwr', ctypes.c_uint64), + ('acc_socket_thrm', ctypes.c_uint64), + ('acc_vr_thrm', ctypes.c_uint64), + ('acc_hbm_thrm', ctypes.c_uint64), ('per_prochot_thrm', ctypes.c_uint64), ('per_ppt_pwr', ctypes.c_uint64), ('per_socket_thrm', ctypes.c_uint64), @@ -745,7 +751,7 @@ struct_amdsmi_violation_status_t._fields_ = [ ('active_vr_thrm', ctypes.c_ubyte), ('active_hbm_thrm', ctypes.c_ubyte), ('PADDING_0', ctypes.c_ubyte * 3), - ('reserved', ctypes.c_uint64 * 24), + ('reserved', ctypes.c_uint64 * 30), ] amdsmi_violation_status_t = struct_amdsmi_violation_status_t @@ -798,19 +804,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - class struct_pcie_metric_(Structure): pass @@ -831,6 +824,19 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 12), ] +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 4c25e43ad2..cb478122e5 100644 --- 
a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -637,6 +637,14 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha violation_status->reference_timestamp = std::numeric_limits<uint64_t>::max(); violation_status->violation_timestamp = std::numeric_limits<uint64_t>::max(); + + violation_status->acc_counter = std::numeric_limits<uint64_t>::max(); + violation_status->acc_prochot_thrm = std::numeric_limits<uint64_t>::max(); + violation_status->acc_ppt_pwr = std::numeric_limits<uint64_t>::max(); + violation_status->acc_socket_thrm = std::numeric_limits<uint64_t>::max(); + violation_status->acc_vr_thrm = std::numeric_limits<uint64_t>::max(); + violation_status->acc_hbm_thrm = std::numeric_limits<uint64_t>::max(); + violation_status->per_prochot_thrm = std::numeric_limits<uint64_t>::max(); violation_status->per_ppt_pwr = std::numeric_limits<uint64_t>::max(); violation_status->per_socket_thrm = std::numeric_limits<uint64_t>::max(); @@ -702,6 +710,14 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha return status; } + // Insert current accumulator counters into struct + violation_status->acc_counter = metric_info_b.accumulation_counter; + violation_status->acc_prochot_thrm = metric_info_b.prochot_residency_acc; + violation_status->acc_ppt_pwr = metric_info_b.ppt_residency_acc; + violation_status->acc_socket_thrm = metric_info_b.socket_thm_residency_acc; + violation_status->acc_vr_thrm = metric_info_b.vr_thm_residency_acc; + violation_status->acc_hbm_thrm = metric_info_b.hbm_thm_residency_acc; + ss << __PRETTY_FUNCTION__ << " | " << "[gpu_metrics A] metric_info_a.accumulation_counter: " << std::dec << metric_info_a.accumulation_counter @@ -818,7 +834,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha } if ( (metric_info_b.hbm_thm_residency_acc != std::numeric_limits<uint64_t>::max() || metric_info_a.hbm_thm_residency_acc != std::numeric_limits<uint64_t>::max()) - && (metric_info_b.hbm_thm_residency_acc >= metric_info_a.vr_thm_residency_acc) + && (metric_info_b.hbm_thm_residency_acc >= 
metric_info_a.hbm_thm_residency_acc) && ((metric_info_b.accumulation_counter - metric_info_a.accumulation_counter) > 0) ) { violation_status->per_hbm_thrm = (((metric_info_b.hbm_thm_residency_acc - diff --git a/tests/python_unittest/integration_test.py b/tests/python_unittest/integration_test.py index bfae5b65f6..85d3c063a7 100755 --- a/tests/python_unittest/integration_test.py +++ b/tests/python_unittest/integration_test.py @@ -864,7 +864,8 @@ class TestAmdSmiPythonInterface(unittest.TestCase): print() self.tearDown() - # Only supported on MI300+ ASICs + # amdsmi_get_violation_status is only supported on MI300+ ASICs + # We should expect a not supported status for Navi / MI100 / MI2x ASICs @handle_exceptions def test_get_violation_status(self): self.setUp() @@ -882,6 +883,17 @@ class TestAmdSmiPythonInterface(unittest.TestCase): print(" Violation Timestamp: {}".format( violation_status['violation_timestamp'])) + print(" Current Prochot Thrm Accumulated (Count): {}".format( + violation_status['acc_prochot_thrm'])) + print(" Current PVIOL (acc_ppt_pwr) Accumulated (Count): {}".format( + violation_status['acc_ppt_pwr'])) + print(" Current TVIOL (acc_socket_thrm) Accumulated (Count): {}".format( + violation_status['acc_socket_thrm'])) + print(" Current VR_THRM Accumulated (Count): {}".format( + violation_status['acc_vr_thrm'])) + print(" Current HBM Thrm Accumulated (Count): {}".format( + violation_status['acc_hbm_thrm'])) + print(" Prochot Thrm Violation (%): {}".format( violation_status['per_prochot_thrm'])) print(" PVIOL (per_ppt_pwr) (%): {}".format(