From 1bc244b911c3840312ea57102c2b405ccc97d0cb Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Thu, 2 May 2024 01:48:14 -0500 Subject: [PATCH] SWDEV-444567 - Added Ring Hang Event Signed-off-by: Maisam Arif Change-Id: I2e73ba08ee0004f6f30660b2fa425ea94bafceca [ROCm/amdsmi commit: 52843152a533ee4435e861ca65b1ed878a9303d2] --- projects/amdsmi/CHANGELOG.md | 7 +++++++ projects/amdsmi/include/amd_smi/amdsmi.h | 4 +++- projects/amdsmi/py-interface/README.md | 3 ++- projects/amdsmi/py-interface/amdsmi_interface.py | 2 ++ projects/amdsmi/py-interface/amdsmi_wrapper.py | 9 +++++++-- .../amdsmi/rocm_smi/include/rocm_smi/kfd_ioctl.h | 1 + .../amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h | 5 ++++- .../rocm_smi/python_smi_tools/rsmiBindings.py | 14 ++++++++------ projects/amdsmi/tests/amd_smi_test/test_utils.cc | 1 + 9 files changed, 35 insertions(+), 11 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 84ead71a7b..114836e8d6 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -8,6 +8,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Added +- **Added Ring Hang event** +Added `AMDSMI_EVT_NOTIF_RING_HANG` to the possible events in the `amdsmi_evt_notification_type_t` enum. + - **Added process isolation and clean shader APIs and CLI commands** Added APIs CLI and APIs to address LeftoverLocals security issues. Allowing clearing the sram data and setting process isolation on a per GPU basis. New APIs: - `amdsmi_get_gpu_process_isolation()` @@ -136,6 +139,10 @@ Updates required `amdsmi_get_power_cap_info` to return in uW as originally refle - **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well. +### Known Issues + +- N/A + ## amd_smi_lib for ROCm 6.1.1 ### Added diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index dcfdfcf7e6..bfc9967228 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -835,13 +835,15 @@ typedef struct { * Event notification event types */ typedef enum { + AMDSMI_EVT_NOTIF_NONE = 0, //!< Not used AMDSMI_EVT_NOTIF_VMFAULT = 1, //!< VM page fault AMDSMI_EVT_NOTIF_FIRST = AMDSMI_EVT_NOTIF_VMFAULT, AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2, AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3, AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4, + AMDSMI_EVT_NOTIF_RING_HANG = 5, - AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_GPU_POST_RESET + AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_RING_HANG } amdsmi_evt_notification_type_t; /** diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index 6bbe75be7f..1d1d9cd5c6 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -1145,8 +1145,9 @@ Event Type | Description ---|------ `VMFAULT` | VM page fault `THERMAL_THROTTLE` | thermal throttle -`GPU_PRE_RESET` | gpu pre reset +`GPU_PRE_RESET` | gpu pre reset `GPU_POST_RESET` | gpu post reset +`RING_HANG` | ring hang event #### read diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 793ffdec61..b5d6f3ecf0 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -217,10 +217,12 @@ class AmdSmiCounterCommand(IntEnum): class AmdSmiEvtNotificationType(IntEnum): + NONE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_NONE VMFAULT = amdsmi_wrapper.AMDSMI_EVT_NOTIF_VMFAULT THERMAL_THROTTLE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_THERMAL_THROTTLE GPU_PRE_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_PRE_RESET GPU_POST_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_POST_RESET + RING_HANG = amdsmi_wrapper.AMDSMI_EVT_NOTIF_RING_HANG class AmdSmiTemperatureMetric(IntEnum): diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 03f4a952f7..912199d186 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -1157,19 +1157,23 @@ amdsmi_counter_value_t = struct_amdsmi_counter_value_t # values for enumeration 'amdsmi_evt_notification_type_t' amdsmi_evt_notification_type_t__enumvalues = { + 0: 'AMDSMI_EVT_NOTIF_NONE', 1: 'AMDSMI_EVT_NOTIF_VMFAULT', 1: 'AMDSMI_EVT_NOTIF_FIRST', 2: 'AMDSMI_EVT_NOTIF_THERMAL_THROTTLE', 3: 'AMDSMI_EVT_NOTIF_GPU_PRE_RESET', 4: 'AMDSMI_EVT_NOTIF_GPU_POST_RESET', - 4: 'AMDSMI_EVT_NOTIF_LAST', + 5: 'AMDSMI_EVT_NOTIF_RING_HANG', + 5: 'AMDSMI_EVT_NOTIF_LAST', } +AMDSMI_EVT_NOTIF_NONE = 0 AMDSMI_EVT_NOTIF_VMFAULT = 1 AMDSMI_EVT_NOTIF_FIRST = 1 AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2 AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3 AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4 -AMDSMI_EVT_NOTIF_LAST = 4 +AMDSMI_EVT_NOTIF_RING_HANG = 5 +AMDSMI_EVT_NOTIF_LAST = 5 amdsmi_evt_notification_type_t = ctypes.c_uint32 # enum class struct_amdsmi_evt_notification_data_t(Structure): pass @@ -2393,6 +2397,7 @@ __all__ = \ 'AMDSMI_EVNT_XGMI_LAST', 'AMDSMI_EVT_NOTIF_FIRST', 'AMDSMI_EVT_NOTIF_GPU_POST_RESET', 'AMDSMI_EVT_NOTIF_GPU_PRE_RESET', 'AMDSMI_EVT_NOTIF_LAST', + 'AMDSMI_EVT_NOTIF_NONE', 'AMDSMI_EVT_NOTIF_RING_HANG', 'AMDSMI_EVT_NOTIF_THERMAL_THROTTLE', 'AMDSMI_EVT_NOTIF_VMFAULT', 'AMDSMI_FILE_NOT_FOUND', 'AMDSMI_FREQ_IND_INVALID', 'AMDSMI_FREQ_IND_MAX', 'AMDSMI_FREQ_IND_MIN', diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/kfd_ioctl.h b/projects/amdsmi/rocm_smi/include/rocm_smi/kfd_ioctl.h index 3b781ce129..6477f44898 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/kfd_ioctl.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/kfd_ioctl.h @@ -553,6 +553,7 @@ enum kfd_smi_event { KFD_SMI_EVENT_THERMAL_THROTTLE = 2, KFD_SMI_EVENT_GPU_PRE_RESET = 3, KFD_SMI_EVENT_GPU_POST_RESET = 4, + KFD_SMI_EVENT_RING_HANG = 5, }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 0fafa31c8f..8d2c3668e0 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -357,13 +357,16 @@ typedef struct { * Event notification event types */ typedef enum { + RSMI_EVT_NOTIF_NONE = KFD_SMI_EVENT_NONE, //!< Unused RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT, //!< VM page fault RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT, + RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE, RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET, + RSMI_EVT_NOTIF_RING_HANG = KFD_SMI_EVENT_RING_HANG, - RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_GPU_POST_RESET + RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG } rsmi_evt_notification_type_t; /** diff --git a/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py b/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py index 94d493d7ea..e0c52d270f 100644 --- a/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py +++ b/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py @@ -134,16 +134,18 @@ class rsmi_dev_perf_level_t(c_int): RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 -notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_RESET'] +notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET', 'RING_HANG'] class rsmi_evt_notification_type_t(c_int): - RSMI_EVT_NOTIF_VMFAULT = 0 + RSMI_EVT_NOTIF_NONE = 0 + RSMI_EVT_NOTIF_VMFAULT = 1 RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT - RSMI_EVT_NOTIF_THERMAL_THROTTLE = 1 - RSMI_EVT_NOTIF_GPU_PRE_RESET = 2 - RSMI_EVT_NOTIF_GPU_POST_RESET = 3 - RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_GPU_POST_RESET + RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2 + RSMI_EVT_NOTIF_GPU_PRE_RESET = 3 + RSMI_EVT_NOTIF_GPU_POST_RESET = 4 + RSMI_EVT_NOTIF_RING_HANG = 5 + RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG class rsmi_voltage_metric_t(c_int): diff --git a/projects/amdsmi/tests/amd_smi_test/test_utils.cc b/projects/amdsmi/tests/amd_smi_test/test_utils.cc index a0b3e8e9e3..0c0cc8a9b6 100644 --- a/projects/amdsmi/tests/amd_smi_test/test_utils.cc +++ b/projects/amdsmi/tests/amd_smi_test/test_utils.cc @@ -84,6 +84,7 @@ static const std::map {AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, "AMDSMI_EVT_NOTIF_THERMAL_THROTTLE"}, {AMDSMI_EVT_NOTIF_GPU_PRE_RESET, "AMDSMI_EVT_NOTIF_GPU_PRE_RESET"}, {AMDSMI_EVT_NOTIF_GPU_POST_RESET, "AMDSMI_EVT_NOTIF_GPU_POST_RESET"}, + {AMDSMI_EVT_NOTIF_RING_HANG, "AMDSMI_EVT_NOTIF_RING_HANG"}, }; const char * NameFromEvtNotifType(amdsmi_evt_notification_type_t evt) {