From 8713305f80a11b9dfc080d549291d76461a477ab Mon Sep 17 00:00:00 2001
From: "Pryor, Adam"
Date: Fri, 16 May 2025 11:01:15 -0500
Subject: [PATCH] [SWDEV-527092] - Process Start/Stop event addition (#368)

- Added more events to `amdsmi_evt_notification_type_t`

Change-Id: I6a256fe828e4bec3197c7fecbed374ab17c6f850
Signed-off-by: Adam Pryor
---
Review notes (not part of the commit message; git ignores the text between
the "---" line above and the first "diff --git" line):

 * py-interface/amdsmi_interface.py: PAGE_FAULT_START was bound to
   AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END, so both enum names shared value 8
   and value 7 had no member. It is now bound to ..._PAGE_FAULT_START.
 * py-interface/amdsmi_wrapper.py: the enumvalues entry for
   'AMDSMI_EVT_NOTIF_LAST' used key 1; it now uses key 13, matching
   AMDSMI_EVT_NOTIF_LAST = 13 further down in the same file.
 * py-interface/amdsmi_wrapper.py: added the missing comma after
   'AMDSMI_EVT_NOTIF_PROCESS_END' in __all__. Without it the literal was
   concatenated with 'AMDSMI_EVT_NOTIF_THERMAL_THROTTLE'.
 * rocm_smi/src/rocm_smi.cc: the sscanf() "%s" conversion is now limited to
   255 characters so it cannot write past task[256].
 * NOTE(review): this patch reached review with its line breaks and leading
   whitespace collapsed and with angle-bracketed spans missing (e-mail
   addresses, cast template arguments). Line breaks, indentation, blank
   context lines and the two cast arguments were reconstructed from the hunk
   headers. Verify them against the tree, or apply with
   "git apply --ignore-whitespace", before merging.

 include/amd_smi/amdsmi.h              | 14 +++++++++---
 py-interface/amdsmi_interface.py      | 17 ++++++++++----
 py-interface/amdsmi_wrapper.py        | 32 ++++++++++++++++++++++-----
 rocm_smi/include/rocm_smi/kfd_ioctl.h |  2 ++
 rocm_smi/include/rocm_smi/rocm_smi.h  |  4 +++-
 rocm_smi/src/rocm_smi.cc              | 15 +++++++++++++
 src/amd_smi/amd_smi.cc                |  6 +++--
 tests/amd_smi_test/test_utils.cc      | 10 ++++++++-
 8 files changed, 83 insertions(+), 17 deletions(-)

diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index 8ea02158ef..a88fa6187f 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -1239,9 +1239,17 @@ typedef enum {
     AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2,
     AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3,
     AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4,
-    AMDSMI_EVT_NOTIF_RING_HANG = 5,
+    AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START = 5,
+    AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END = 6,
+    AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = 7,
+    AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = 8,
+    AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = 9,
+    AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = 10,
+    AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = 11,
+    AMDSMI_EVT_NOTIF_PROCESS_START = 12,
+    AMDSMI_EVT_NOTIF_PROCESS_END = 13,
 
-    AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_RING_HANG
+    AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_PROCESS_END
 } amdsmi_evt_notification_type_t;
 
 /**
@@ -1255,7 +1263,7 @@ typedef enum {
  * @brief Maximum number of characters an event notification message will be
  * matches kfd message max size
  */
-#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96
+#define MAX_EVENT_NOTIFICATION_MSG_SIZE 256
 
 /**
  * @brief Event notification data returned from event notification API
diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py
index 3142b7b565..fd12f155d7 100644
--- a/py-interface/amdsmi_interface.py
+++ b/py-interface/amdsmi_interface.py
@@ -88,7 +88,7 @@ AMDSMI_MAX_CACHE_TYPES = 10
 AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK = 64
 AMDSMI_GPU_UUID_SIZE = 38
 MAX_AMDSMI_NAME_LENGTH = 64
-MAX_EVENT_NOTIFICATION_MSG_SIZE = 96
+MAX_EVENT_NOTIFICATION_MSG_SIZE = 256
 
 _AMDSMI_STRING_LENGTH = 80
 
@@ -273,7 +273,15 @@ class AmdSmiEvtNotificationType(IntEnum):
     THERMAL_THROTTLE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_THERMAL_THROTTLE
     GPU_PRE_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_PRE_RESET
     GPU_POST_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_POST_RESET
-    RING_HANG = amdsmi_wrapper.AMDSMI_EVT_NOTIF_RING_HANG
+    MIGRATE_START = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START
+    MIGRATE_END = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END
+    PAGE_FAULT_START = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START
+    PAGE_FAULT_END = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END
+    QUEUE_EVICTION = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION
+    QUEUE_RESTORE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE
+    UNMAP_FROM_GPU = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU
+    PROCESS_START = amdsmi_wrapper.AMDSMI_EVT_NOTIF_PROCESS_START
+    PROCESS_END = amdsmi_wrapper.AMDSMI_EVT_NOTIF_PROCESS_END
 
 
 class AmdSmiTemperatureMetric(IntEnum):
@@ -543,17 +551,18 @@ class AmdSmiEventReader:
                 processor_handle, ctypes.c_uint64(mask)))
 
     def read(self, timestamp, num_elem=10):
+        c_count = ctypes.c_uint32(num_elem)
         self.event_info = (amdsmi_wrapper.amdsmi_evt_notification_data_t * num_elem)()
         _check_res(
             amdsmi_wrapper.amdsmi_get_gpu_event_notification(
                 ctypes.c_int(timestamp),
-                ctypes.byref(ctypes.c_uint32(num_elem)),
+                ctypes.byref(c_count),
                 self.event_info,
             )
         )
 
         ret = []
-        for i in range(0, num_elem):
+        for i in range(c_count.value):
             unique_event_values = set(event.value for event in AmdSmiEvtNotificationType)
             if self.event_info[i].event in unique_event_values:
                 if AmdSmiEvtNotificationType(self.event_info[i].event).name != "NONE":
diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py
index d9a83b9345..8cdfbf0281 100644
--- a/py-interface/amdsmi_wrapper.py
+++ b/py-interface/amdsmi_wrapper.py
@@ -1418,8 +1418,16 @@ amdsmi_evt_notification_type_t__enumvalues = {
     2: 'AMDSMI_EVT_NOTIF_THERMAL_THROTTLE',
     3: 'AMDSMI_EVT_NOTIF_GPU_PRE_RESET',
     4: 'AMDSMI_EVT_NOTIF_GPU_POST_RESET',
-    5: 'AMDSMI_EVT_NOTIF_RING_HANG',
-    5: 'AMDSMI_EVT_NOTIF_LAST',
+    5: 'AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START',
+    6: 'AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END',
+    7: 'AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START',
+    8: 'AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END',
+    9: 'AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION',
+    10: 'AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE',
+    11: 'AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU',
+    12: 'AMDSMI_EVT_NOTIF_PROCESS_START',
+    13: 'AMDSMI_EVT_NOTIF_PROCESS_END',
+    13: 'AMDSMI_EVT_NOTIF_LAST',
 }
 AMDSMI_EVT_NOTIF_NONE = 0
 AMDSMI_EVT_NOTIF_VMFAULT = 1
@@ -1427,8 +1435,16 @@ AMDSMI_EVT_NOTIF_FIRST = 1
 AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2
 AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3
 AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4
-AMDSMI_EVT_NOTIF_RING_HANG = 5
-AMDSMI_EVT_NOTIF_LAST = 5
+AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START = 5
+AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END = 6
+AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = 7
+AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = 8
+AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = 9
+AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = 10
+AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = 11
+AMDSMI_EVT_NOTIF_PROCESS_START = 12
+AMDSMI_EVT_NOTIF_PROCESS_END = 13
+AMDSMI_EVT_NOTIF_LAST = 13
 amdsmi_evt_notification_type_t = ctypes.c_uint32 # enum
 class struct_amdsmi_evt_notification_data_t(Structure):
     pass
@@ -1437,7 +1453,7 @@ struct_amdsmi_evt_notification_data_t._pack_ = 1 # source:False
 struct_amdsmi_evt_notification_data_t._fields_ = [
     ('processor_handle', ctypes.POINTER(None)),
     ('event', amdsmi_evt_notification_type_t),
-    ('message', ctypes.c_char * 96),
+    ('message', ctypes.c_char * 256),
     ('PADDING_0', ctypes.c_ubyte * 4),
 ]
 
@@ -2986,7 +3002,11 @@ __all__ = \
     'AMDSMI_EVNT_XGMI_LAST', 'AMDSMI_EVT_NOTIF_FIRST',
     'AMDSMI_EVT_NOTIF_GPU_POST_RESET',
     'AMDSMI_EVT_NOTIF_GPU_PRE_RESET', 'AMDSMI_EVT_NOTIF_LAST',
-    'AMDSMI_EVT_NOTIF_NONE', 'AMDSMI_EVT_NOTIF_RING_HANG',
+    'AMDSMI_EVT_NOTIF_NONE', 'AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START',
+    'AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END', 'AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START',
+    'AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END', 'AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION',
+    'AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE', 'AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU',
+    'AMDSMI_EVT_NOTIF_PROCESS_START', 'AMDSMI_EVT_NOTIF_PROCESS_END',
     'AMDSMI_EVT_NOTIF_THERMAL_THROTTLE', 'AMDSMI_EVT_NOTIF_VMFAULT',
     'AMDSMI_FINE_DECODER_ACTIVITY', 'AMDSMI_FINE_GRAIN_GFX_ACTIVITY',
     'AMDSMI_FINE_GRAIN_MEM_ACTIVITY', 'AMDSMI_FREQ_IND_INVALID',
diff --git a/rocm_smi/include/rocm_smi/kfd_ioctl.h b/rocm_smi/include/rocm_smi/kfd_ioctl.h
index 2334afb9c0..8bce19ddd7 100644
--- a/rocm_smi/include/rocm_smi/kfd_ioctl.h
+++ b/rocm_smi/include/rocm_smi/kfd_ioctl.h
@@ -537,6 +537,8 @@ enum kfd_smi_event {
 	KFD_SMI_EVENT_QUEUE_EVICTION = 9,
 	KFD_SMI_EVENT_QUEUE_RESTORE = 10,
 	KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,
+	KFD_SMI_EVENT_PROCESS_START = 12,
+	KFD_SMI_EVENT_PROCESS_END = 13,
 
 	/*
 	 * max event number, as a flag bit to get events from all processes,
diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h
index 09c9e54cb0..33cd65999d 100644
--- a/rocm_smi/include/rocm_smi/rocm_smi.h
+++ b/rocm_smi/include/rocm_smi/rocm_smi.h
@@ -351,6 +351,8 @@ typedef enum {
   RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = KFD_SMI_EVENT_QUEUE_EVICTION,
   RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = KFD_SMI_EVENT_QUEUE_RESTORE,
   RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = KFD_SMI_EVENT_UNMAP_FROM_GPU,
+  RSMI_EVT_NOTIF_EVENT_PROCESS_START = KFD_SMI_EVENT_PROCESS_START,
+  RSMI_EVT_NOTIF_EVENT_PROCESS_END = KFD_SMI_EVENT_PROCESS_END,
   RSMI_EVT_NOTIF_EVENT_ALL_PROCESS = KFD_SMI_EVENT_ALL_PROCESS,
   RSMI_EVT_NOTIF_LAST = KFD_SMI_EVENT_ALL_PROCESS
 } rsmi_evt_notification_type_t;
@@ -362,7 +364,7 @@ typedef enum {
 
 //! Maximum number of characters an event notification message will be
 // matches kfd message max size
-#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96
+#define MAX_EVENT_NOTIFICATION_MSG_SIZE 256
 
 /**
  * Event notification data returned from event notification API
diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc
index 13850874b3..9f0d3131c8 100644
--- a/rocm_smi/src/rocm_smi.cc
+++ b/rocm_smi/src/rocm_smi.cc
@@ -7434,6 +7434,21 @@ rsmi_event_notification_get(int timeout_ms,
           strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
         }
         break;
+      case RSMI_EVT_NOTIF_EVENT_PROCESS_START:
+      case RSMI_EVT_NOTIF_EVENT_PROCESS_END:
+      {
+        uint32_t pid; char task[MAX_EVENT_NOTIFICATION_MSG_SIZE];
+        int rc = sscanf(message, "%x %255s", &pid, task);  // width = sizeof(task) - 1
+        std::stringstream msg;
+        if (rc == 2){
+          msg << "PID: " << pid << " task: " << task;
+        } else{
+          LOG_ERROR("Failed to parse process event payload");
+          msg << "PID:UNKNOWN task:UNKNOWN";
+        }
+        snprintf(data_item->message, MAX_EVENT_NOTIFICATION_MSG_SIZE, "%s", msg.str().c_str());
+      }
+      break;
       default:
         strcpy(reinterpret_cast<char *>(&data_item->message), "Unknown event received");
         break;
diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc
index dfcb5161d2..63cf19b5fe 100644
--- a/src/amd_smi/amd_smi.cc
+++ b/src/amd_smi/amd_smi.cc
@@ -1926,8 +1926,10 @@ amdsmi_get_gpu_event_notification(int timeout_ms,
         rsmi_evt_notification_data_t rsmi_data = r_data[i];
         data[i].event = static_cast<amdsmi_evt_notification_type_t>(
                 rsmi_data.event);
-        strncpy(data[i].message, rsmi_data.message,
-                MAX_EVENT_NOTIFICATION_MSG_SIZE);
+        snprintf(data[i].message,
+            MAX_EVENT_NOTIFICATION_MSG_SIZE,
+            "%s",
+            rsmi_data.message);
         amdsmi_status_t r = amd::smi::AMDSmiSystem::getInstance()
             .gpu_index_to_handle(rsmi_data.dv_ind, &(data[i].processor_handle));
         if (r != AMDSMI_STATUS_SUCCESS) return r;
diff --git a/tests/amd_smi_test/test_utils.cc b/tests/amd_smi_test/test_utils.cc
index 8f4661cdbe..73bb52e41f 100644
--- a/tests/amd_smi_test/test_utils.cc
+++ b/tests/amd_smi_test/test_utils.cc
@@ -61,7 +61,15 @@ static const std::map
   {AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, "AMDSMI_EVT_NOTIF_THERMAL_THROTTLE"},
   {AMDSMI_EVT_NOTIF_GPU_PRE_RESET, "AMDSMI_EVT_NOTIF_GPU_PRE_RESET"},
   {AMDSMI_EVT_NOTIF_GPU_POST_RESET, "AMDSMI_EVT_NOTIF_GPU_POST_RESET"},
-  {AMDSMI_EVT_NOTIF_RING_HANG, "AMDSMI_EVT_NOTIF_RING_HANG"},
+  {AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START, "AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START"},
+  {AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END, "AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END"},
+  {AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START, "AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START"},
+  {AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END, "AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END"},
+  {AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION, "AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION"},
+  {AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE, "AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE"},
+  {AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU, "AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU"},
+  {AMDSMI_EVT_NOTIF_PROCESS_START, "AMDSMI_EVT_NOTIF_PROCESS_START"},
+  {AMDSMI_EVT_NOTIF_PROCESS_END, "AMDSMI_EVT_NOTIF_PROCESS_END"},
 };
 const char *
 NameFromEvtNotifType(amdsmi_evt_notification_type_t evt) {