From 5525bf8c86e95be30defd4e8cb249f3058fee054 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 2 May 2024 02:39:20 -0500 Subject: [PATCH] AMDSMI - Add ring hang event Change-Id: I84696e3cc1a4eba8de48e464f1a208ed9c6e489d Depends-On: I2e73ba08ee0004f6f30660b2fa425ea94bafceca Signed-off-by: Galantsev, Dmitrii --- common/rdc_field.data | 1 + include/rdc/rdc.h | 3 ++- python_binding/rdc_bootstrap.py | 1 + rdc_libs/rdc/src/RdcNotificationImpl.cc | 2 ++ 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/common/rdc_field.data b/common/rdc_field.data index 58756bea9f..6d30de11d4 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -120,3 +120,4 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_VMFAULT, "VM page fault", FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp", "THERMAL_THROT", false) FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false) FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occurred", "RING_HANG", false) diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index 882d360299..c2cb224b67 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -301,8 +301,9 @@ typedef enum { //!< due to temperature rise RDC_EVNT_NOTIF_PRE_RESET, //!< GPU reset is about to occur RDC_EVNT_NOTIF_POST_RESET, //!< GPU reset just occurred + RDC_EVNT_NOTIF_RING_HANG, //!< GPU ring hang just occurred - RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_POST_RESET, + RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_RING_HANG, } rdc_field_t; #define RDC_EVNT_IS_NOTIF_FIELD(FIELD) \ ((FIELD) >= RDC_EVNT_NOTIF_FIRST && (FIELD) <= RDC_EVNT_NOTIF_LAST) diff --git a/python_binding/rdc_bootstrap.py b/python_binding/rdc_bootstrap.py index 1fc2fea9fd..7c94576961 100644 --- a/python_binding/rdc_bootstrap.py +++ b/python_binding/rdc_bootstrap.py @@ -146,6 +146,7 @@ class rdc_field_t(c_int): RDC_EVNT_NOTIF_THERMAL_THROTTLE = 2001 
RDC_EVNT_NOTIF_PRE_RESET = 2002 RDC_EVNT_NOTIF_POST_RESET = 2003 + RDC_EVNT_NOTIF_RING_HANG = 2004 rdc_handle_t = c_void_p rdc_gpu_group_t = c_uint32 diff --git a/rdc_libs/rdc/src/RdcNotificationImpl.cc b/rdc_libs/rdc/src/RdcNotificationImpl.cc index 47434523b0..df355e18f4 100644 --- a/rdc_libs/rdc/src/RdcNotificationImpl.cc +++ b/rdc_libs/rdc/src/RdcNotificationImpl.cc @@ -45,6 +45,7 @@ static std::unordered_map rdc_2_smi {RDC_EVNT_NOTIF_THERMAL_THROTTLE, AMDSMI_EVT_NOTIF_THERMAL_THROTTLE}, {RDC_EVNT_NOTIF_PRE_RESET, AMDSMI_EVT_NOTIF_GPU_PRE_RESET}, {RDC_EVNT_NOTIF_POST_RESET, AMDSMI_EVT_NOTIF_GPU_POST_RESET}, + {RDC_EVNT_NOTIF_RING_HANG, AMDSMI_EVT_NOTIF_RING_HANG}, }; static std::unordered_map smi_event_notif_2_rdc_map = { {AMDSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT}, @@ -52,6 +53,7 @@ static std::unordered_map smi_event {AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE}, {AMDSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET}, {AMDSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET}, + {AMDSMI_EVT_NOTIF_RING_HANG, RDC_EVNT_NOTIF_RING_HANG}, }; // This const determines space allocated on stack for notification events.