[SWDEV-463406] Add violation_status current counter/accumulated values
Changes:
- amdsmi_violation_status_t now includes current accumulated/counter
values
- Tests/wrapper now include added values
- Removed ASIC references in header for host/bm alignment
- Fix violation_status->per_hbm_thrm /
violation_status->active_hbm_thrm
calculations.
Change-Id: Ic86a7cbad5198a41018f82f6b588b83158d9ba0b
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Этот коммит содержится в:
@@ -2091,7 +2091,7 @@ class AMDSMICommands():
|
||||
if "throttle" in current_platform_args:
|
||||
if args.throttle:
|
||||
throttle_status = {
|
||||
# gpu metric values
|
||||
# violation status values - counter/accumulated
|
||||
'accumulation_counter': "N/A",
|
||||
'prochot_accumulated': "N/A",
|
||||
'ppt_accumulated': "N/A",
|
||||
@@ -2114,20 +2114,15 @@ class AMDSMICommands():
|
||||
'hbm_thermal_violation_percent': "N/A"
|
||||
}
|
||||
|
||||
try:
|
||||
throttle_status['accumulation_counter'] = gpu_metric['accumulation_counter']
|
||||
throttle_status['prochot_accumulated'] = gpu_metric['prochot_residency_acc']
|
||||
throttle_status['ppt_accumulated'] = gpu_metric['ppt_residency_acc']
|
||||
throttle_status['socket_thermal_accumulated'] = gpu_metric['socket_thm_residency_acc']
|
||||
throttle_status['vr_thermal_accumulated'] = gpu_metric['vr_thm_residency_acc']
|
||||
throttle_status['hbm_thermal_accumulated'] = gpu_metric['hbm_thm_residency_acc']
|
||||
|
||||
except Exception as e:
|
||||
values_dict['throttle'] = throttle_status
|
||||
logging.debug("Failed to get gpu metric information for throttle status' for gpu %s | %s", gpu_id, e)
|
||||
|
||||
try:
|
||||
violation_status = amdsmi_interface.amdsmi_get_violation_status(args.gpu)
|
||||
throttle_status['accumulation_counter'] = violation_status['acc_counter']
|
||||
throttle_status['prochot_accumulated'] = violation_status['acc_prochot_thrm']
|
||||
throttle_status['ppt_accumulated'] = violation_status['acc_ppt_pwr']
|
||||
throttle_status['socket_thermal_accumulated'] = violation_status['acc_socket_thrm']
|
||||
throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm']
|
||||
throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm']
|
||||
|
||||
throttle_status['prochot_violation_active'] = violation_status['active_prochot_thrm']
|
||||
throttle_status['ppt_violation_active'] = violation_status['active_ppt_pwr']
|
||||
throttle_status['socket_thermal_violation_active'] = violation_status['active_socket_thrm']
|
||||
|
||||
@@ -528,11 +528,16 @@ typedef struct {
|
||||
} amdsmi_vram_usage_t;
|
||||
/**
|
||||
* @brief This structure holds violation status information.
|
||||
* Note: for MI3x asics and higher, older ASICs will show unsupported.
|
||||
*/
|
||||
typedef struct {
|
||||
uint64_t reference_timestamp; //!< Represents CPU timestamp in microseconds (uS)
|
||||
uint64_t violation_timestamp; //!< Violation time in milliseconds (ms)
|
||||
uint64_t acc_counter; //!< Current accumulated counter; Max uint64 means unsupported
|
||||
uint64_t acc_prochot_thrm; //!< Current accumulated processor hot violation count; Max uint64 means unsupported
|
||||
uint64_t acc_ppt_pwr; //!< PVIOL; Current accumulated Package Power Tracking (PPT) count; Max uint64 means unsupported
|
||||
uint64_t acc_socket_thrm; //!< TVIOL; Current accumulated Socket thermal count; Max uint64 means unsupported
|
||||
uint64_t acc_vr_thrm; //!< Current accumulated voltage regulator count; Max uint64 means unsupported
|
||||
uint64_t acc_hbm_thrm; //!< Current accumulated High Bandwidth Memory (HBM) thermal count; Max uint64 means unsupported
|
||||
uint64_t per_prochot_thrm; //!< Processor hot violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
uint64_t per_ppt_pwr; //!< PVIOL; Package Power Tracking (PPT) violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
uint64_t per_socket_thrm; //!< TVIOL; Socket thermal violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
@@ -543,7 +548,7 @@ typedef struct {
|
||||
uint8_t active_socket_thrm; //!< Socket thermal violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
uint8_t active_vr_thrm; //!< Voltage regulator violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
uint8_t active_hbm_thrm; //!< High Bandwidth Memory (HBM) thermal violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
uint64_t reserved[24]; // Reserved for new violation info
|
||||
uint64_t reserved[30]; // Reserved for new violation info
|
||||
} amdsmi_violation_status_t;
|
||||
typedef struct {
|
||||
amdsmi_range_t supported_freq_range;
|
||||
|
||||
@@ -2020,6 +2020,12 @@ def amdsmi_get_violation_status(
|
||||
return {
|
||||
"reference_timestamp": _validate_if_max_uint(violation_status.reference_timestamp, MaxUIntegerTypes.UINT64_T),
|
||||
"violation_timestamp": _validate_if_max_uint(violation_status.violation_timestamp, MaxUIntegerTypes.UINT64_T),
|
||||
"acc_counter": _validate_if_max_uint(violation_status.acc_counter, MaxUIntegerTypes.UINT64_T),
|
||||
"acc_prochot_thrm": _validate_if_max_uint(violation_status.acc_prochot_thrm, MaxUIntegerTypes.UINT64_T),
|
||||
"acc_ppt_pwr": _validate_if_max_uint(violation_status.acc_ppt_pwr, MaxUIntegerTypes.UINT64_T), #PVIOL
|
||||
"acc_socket_thrm": _validate_if_max_uint(violation_status.acc_socket_thrm, MaxUIntegerTypes.UINT64_T), #TVIOL
|
||||
"acc_vr_thrm": _validate_if_max_uint(violation_status.acc_vr_thrm, MaxUIntegerTypes.UINT64_T),
|
||||
"acc_hbm_thrm": _validate_if_max_uint(violation_status.acc_hbm_thrm, MaxUIntegerTypes.UINT64_T),
|
||||
"per_prochot_thrm": _validate_if_max_uint(violation_status.per_prochot_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True),
|
||||
"per_ppt_pwr": _validate_if_max_uint(violation_status.per_ppt_pwr, MaxUIntegerTypes.UINT64_T, isActivity=True), #PVIOL
|
||||
"per_socket_thrm": _validate_if_max_uint(violation_status.per_socket_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), #TVIOL
|
||||
|
||||
@@ -734,6 +734,12 @@ struct_amdsmi_violation_status_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_violation_status_t._fields_ = [
|
||||
('reference_timestamp', ctypes.c_uint64),
|
||||
('violation_timestamp', ctypes.c_uint64),
|
||||
('acc_counter', ctypes.c_uint64),
|
||||
('acc_prochot_thrm', ctypes.c_uint64),
|
||||
('acc_ppt_pwr', ctypes.c_uint64),
|
||||
('acc_socket_thrm', ctypes.c_uint64),
|
||||
('acc_vr_thrm', ctypes.c_uint64),
|
||||
('acc_hbm_thrm', ctypes.c_uint64),
|
||||
('per_prochot_thrm', ctypes.c_uint64),
|
||||
('per_ppt_pwr', ctypes.c_uint64),
|
||||
('per_socket_thrm', ctypes.c_uint64),
|
||||
@@ -745,7 +751,7 @@ struct_amdsmi_violation_status_t._fields_ = [
|
||||
('active_vr_thrm', ctypes.c_ubyte),
|
||||
('active_hbm_thrm', ctypes.c_ubyte),
|
||||
('PADDING_0', ctypes.c_ubyte * 3),
|
||||
('reserved', ctypes.c_uint64 * 24),
|
||||
('reserved', ctypes.c_uint64 * 30),
|
||||
]
|
||||
|
||||
amdsmi_violation_status_t = struct_amdsmi_violation_status_t
|
||||
@@ -798,19 +804,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum
|
||||
class struct_amdsmi_pcie_info_t(Structure):
|
||||
pass
|
||||
|
||||
class struct_pcie_static_(Structure):
|
||||
pass
|
||||
|
||||
struct_pcie_static_._pack_ = 1 # source:False
|
||||
struct_pcie_static_._fields_ = [
|
||||
('max_pcie_width', ctypes.c_uint16),
|
||||
('PADDING_0', ctypes.c_ubyte * 2),
|
||||
('max_pcie_speed', ctypes.c_uint32),
|
||||
('pcie_interface_version', ctypes.c_uint32),
|
||||
('slot_type', amdsmi_card_form_factor_t),
|
||||
('reserved', ctypes.c_uint64 * 10),
|
||||
]
|
||||
|
||||
class struct_pcie_metric_(Structure):
|
||||
pass
|
||||
|
||||
@@ -831,6 +824,19 @@ struct_pcie_metric_._fields_ = [
|
||||
('reserved', ctypes.c_uint64 * 12),
|
||||
]
|
||||
|
||||
class struct_pcie_static_(Structure):
|
||||
pass
|
||||
|
||||
struct_pcie_static_._pack_ = 1 # source:False
|
||||
struct_pcie_static_._fields_ = [
|
||||
('max_pcie_width', ctypes.c_uint16),
|
||||
('PADDING_0', ctypes.c_ubyte * 2),
|
||||
('max_pcie_speed', ctypes.c_uint32),
|
||||
('pcie_interface_version', ctypes.c_uint32),
|
||||
('slot_type', amdsmi_card_form_factor_t),
|
||||
('reserved', ctypes.c_uint64 * 10),
|
||||
]
|
||||
|
||||
struct_amdsmi_pcie_info_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_pcie_info_t._fields_ = [
|
||||
('pcie_static', struct_pcie_static_),
|
||||
|
||||
@@ -637,6 +637,14 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
|
||||
violation_status->reference_timestamp = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->violation_timestamp = std::numeric_limits<uint64_t>::max();
|
||||
|
||||
violation_status->acc_counter = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->acc_prochot_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->acc_ppt_pwr = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->acc_socket_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->acc_vr_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->acc_hbm_thrm = std::numeric_limits<uint64_t>::max();
|
||||
|
||||
violation_status->per_prochot_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->per_ppt_pwr = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->per_socket_thrm = std::numeric_limits<uint64_t>::max();
|
||||
@@ -702,6 +710,14 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
return status;
|
||||
}
|
||||
|
||||
// Insert current accumulator counters into struct
|
||||
violation_status->acc_counter = metric_info_b.accumulation_counter;
|
||||
violation_status->acc_prochot_thrm = metric_info_b.prochot_residency_acc;
|
||||
violation_status->acc_ppt_pwr = metric_info_b.ppt_residency_acc;
|
||||
violation_status->acc_socket_thrm = metric_info_b.socket_thm_residency_acc;
|
||||
violation_status->acc_vr_thrm = metric_info_b.vr_thm_residency_acc;
|
||||
violation_status->acc_hbm_thrm = metric_info_b.hbm_thm_residency_acc;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | "
|
||||
<< "[gpu_metrics A] metric_info_a.accumulation_counter: " << std::dec
|
||||
<< metric_info_a.accumulation_counter
|
||||
@@ -818,7 +834,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
}
|
||||
if ( (metric_info_b.hbm_thm_residency_acc != std::numeric_limits<uint64_t>::max()
|
||||
|| metric_info_a.hbm_thm_residency_acc != std::numeric_limits<uint64_t>::max())
|
||||
&& (metric_info_b.hbm_thm_residency_acc >= metric_info_a.vr_thm_residency_acc)
|
||||
&& (metric_info_b.hbm_thm_residency_acc >= metric_info_a.hbm_thm_residency_acc)
|
||||
&& ((metric_info_b.accumulation_counter - metric_info_a.accumulation_counter) > 0) ) {
|
||||
violation_status->per_hbm_thrm =
|
||||
(((metric_info_b.hbm_thm_residency_acc -
|
||||
|
||||
@@ -864,7 +864,8 @@ class TestAmdSmiPythonInterface(unittest.TestCase):
|
||||
print()
|
||||
self.tearDown()
|
||||
|
||||
# Only supported on MI300+ ASICs
|
||||
# amdsmi_get_violation_status is only supported on MI300+ ASICs
|
||||
# We should expect a not supported status for Navi / MI100 / MI2x ASICs
|
||||
@handle_exceptions
|
||||
def test_get_violation_status(self):
|
||||
self.setUp()
|
||||
@@ -882,6 +883,17 @@ class TestAmdSmiPythonInterface(unittest.TestCase):
|
||||
print(" Violation Timestamp: {}".format(
|
||||
violation_status['violation_timestamp']))
|
||||
|
||||
print(" Current Prochot Thrm Accumulated (Count): {}".format(
|
||||
violation_status['acc_prochot_thrm']))
|
||||
print(" Current PVIOL (acc_ppt_pwr) Accumulated (Count): {}".format(
|
||||
violation_status['acc_ppt_pwr']))
|
||||
print(" Current TVIOL (acc_socket_thrm) Accumulated (Count): {}".format(
|
||||
violation_status['acc_socket_thrm']))
|
||||
print(" Current VR_THRM Accumulated (Count): {}".format(
|
||||
violation_status['acc_vr_thrm']))
|
||||
print(" Current HBM Thrm Accumulated (Count): {}".format(
|
||||
violation_status['acc_hbm_thrm']))
|
||||
|
||||
print(" Prochot Thrm Violation (%): {}".format(
|
||||
violation_status['per_prochot_thrm']))
|
||||
print(" PVIOL (per_ppt_pwr) (%): {}".format(
|
||||
|
||||
Ссылка в новой задаче
Block a user