[SWDEV-527092] - Process Start/Stop event addition (#368)

- Added more events to `amdsmi_evt_notification_type_t`

Change-Id: I6a256fe828e4bec3197c7fecbed374ab17c6f850
Signed-off-by: Adam Pryor <Adam.Pryor@amd.com>
Tento commit je obsažen v:
Pryor, Adam
2025-05-16 11:01:15 -05:00
odevzdal GitHub
rodič bacbaac0b1
revize 8713305f80
8 změnil soubory, kde provedl 83 přidání a 17 odebrání
+11 -3
Zobrazit soubor
@@ -1239,9 +1239,17 @@ typedef enum {
AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2,
AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3,
AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4,
AMDSMI_EVT_NOTIF_RING_HANG = 5,
AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START = 5,
AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END = 6,
AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = 7,
AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = 8,
AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = 9,
AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = 10,
AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = 11,
AMDSMI_EVT_NOTIF_PROCESS_START = 12,
AMDSMI_EVT_NOTIF_PROCESS_END = 13,
AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_RING_HANG
AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_PROCESS_END
} amdsmi_evt_notification_type_t;
/**
@@ -1255,7 +1263,7 @@ typedef enum {
* @brief Maximum number of characters in an event notification message;
* matches the kfd message max size
*/
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 256
/**
* @brief Event notification data returned from event notification API
+13 -4
Zobrazit soubor
@@ -88,7 +88,7 @@ AMDSMI_MAX_CACHE_TYPES = 10
AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK = 64
AMDSMI_GPU_UUID_SIZE = 38
MAX_AMDSMI_NAME_LENGTH = 64
MAX_EVENT_NOTIFICATION_MSG_SIZE = 96
MAX_EVENT_NOTIFICATION_MSG_SIZE = 256
_AMDSMI_STRING_LENGTH = 80
@@ -273,7 +273,15 @@ class AmdSmiEvtNotificationType(IntEnum):
THERMAL_THROTTLE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_THERMAL_THROTTLE
GPU_PRE_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_PRE_RESET
GPU_POST_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_POST_RESET
RING_HANG = amdsmi_wrapper.AMDSMI_EVT_NOTIF_RING_HANG
MIGRATE_START = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START
MIGRATE_END = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END
# Start-of-page-fault event; must map to the *_START constant so it stays distinct from PAGE_FAULT_END.
PAGE_FAULT_START = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START
PAGE_FAULT_END = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END
QUEUE_EVICTION = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION
QUEUE_RESTORE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE
UNMAP_FROM_GPU = amdsmi_wrapper.AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU
PROCESS_START = amdsmi_wrapper.AMDSMI_EVT_NOTIF_PROCESS_START
PROCESS_END = amdsmi_wrapper.AMDSMI_EVT_NOTIF_PROCESS_END
class AmdSmiTemperatureMetric(IntEnum):
@@ -543,17 +551,18 @@ class AmdSmiEventReader:
processor_handle, ctypes.c_uint64(mask)))
def read(self, timestamp, num_elem=10):
c_count = ctypes.c_uint32(num_elem)
self.event_info = (amdsmi_wrapper.amdsmi_evt_notification_data_t * num_elem)()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_event_notification(
ctypes.c_int(timestamp),
ctypes.byref(ctypes.c_uint32(num_elem)),
ctypes.byref(c_count),
self.event_info,
)
)
ret = []
for i in range(0, num_elem):
for i in range(c_count.value):
unique_event_values = set(event.value for event in AmdSmiEvtNotificationType)
if self.event_info[i].event in unique_event_values:
if AmdSmiEvtNotificationType(self.event_info[i].event).name != "NONE":
+26 -6
Zobrazit soubor
@@ -1418,8 +1418,16 @@ amdsmi_evt_notification_type_t__enumvalues = {
2: 'AMDSMI_EVT_NOTIF_THERMAL_THROTTLE',
3: 'AMDSMI_EVT_NOTIF_GPU_PRE_RESET',
4: 'AMDSMI_EVT_NOTIF_GPU_POST_RESET',
5: 'AMDSMI_EVT_NOTIF_RING_HANG',
5: 'AMDSMI_EVT_NOTIF_LAST',
5: 'AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START',
6: 'AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END',
7: 'AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START',
8: 'AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END',
9: 'AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION',
10: 'AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE',
11: 'AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU',
12: 'AMDSMI_EVT_NOTIF_PROCESS_START',
13: 'AMDSMI_EVT_NOTIF_PROCESS_END',
1: 'AMDSMI_EVT_NOTIF_LAST',
}
AMDSMI_EVT_NOTIF_NONE = 0
AMDSMI_EVT_NOTIF_VMFAULT = 1
@@ -1427,8 +1435,16 @@ AMDSMI_EVT_NOTIF_FIRST = 1
AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2
AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3
AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4
AMDSMI_EVT_NOTIF_RING_HANG = 5
AMDSMI_EVT_NOTIF_LAST = 5
AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START = 5
AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END = 6
AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = 7
AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = 8
AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = 9
AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = 10
AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = 11
AMDSMI_EVT_NOTIF_PROCESS_START = 12
AMDSMI_EVT_NOTIF_PROCESS_END = 13
AMDSMI_EVT_NOTIF_LAST = 13
amdsmi_evt_notification_type_t = ctypes.c_uint32 # enum
class struct_amdsmi_evt_notification_data_t(Structure):
pass
@@ -1437,7 +1453,7 @@ struct_amdsmi_evt_notification_data_t._pack_ = 1 # source:False
struct_amdsmi_evt_notification_data_t._fields_ = [
('processor_handle', ctypes.POINTER(None)),
('event', amdsmi_evt_notification_type_t),
('message', ctypes.c_char * 96),
('message', ctypes.c_char * 256),
('PADDING_0', ctypes.c_ubyte * 4),
]
@@ -2986,7 +3002,11 @@ __all__ = \
'AMDSMI_EVNT_XGMI_LAST', 'AMDSMI_EVT_NOTIF_FIRST',
'AMDSMI_EVT_NOTIF_GPU_POST_RESET',
'AMDSMI_EVT_NOTIF_GPU_PRE_RESET', 'AMDSMI_EVT_NOTIF_LAST',
'AMDSMI_EVT_NOTIF_NONE', 'AMDSMI_EVT_NOTIF_RING_HANG',
'AMDSMI_EVT_NOTIF_NONE', 'AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START',
'AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END', 'AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START',
'AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END', 'AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION',
'AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE', 'AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU',
'AMDSMI_EVT_NOTIF_PROCESS_START', 'AMDSMI_EVT_NOTIF_PROCESS_END',
'AMDSMI_EVT_NOTIF_THERMAL_THROTTLE', 'AMDSMI_EVT_NOTIF_VMFAULT',
'AMDSMI_FINE_DECODER_ACTIVITY', 'AMDSMI_FINE_GRAIN_GFX_ACTIVITY',
'AMDSMI_FINE_GRAIN_MEM_ACTIVITY', 'AMDSMI_FREQ_IND_INVALID',
+2
Zobrazit soubor
@@ -537,6 +537,8 @@ enum kfd_smi_event {
KFD_SMI_EVENT_QUEUE_EVICTION = 9,
KFD_SMI_EVENT_QUEUE_RESTORE = 10,
KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,
KFD_SMI_EVENT_PROCESS_START = 12,
KFD_SMI_EVENT_PROCESS_END = 13,
/*
* max event number, as a flag bit to get events from all processes,
+3 -1
Zobrazit soubor
@@ -351,6 +351,8 @@ typedef enum {
RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = KFD_SMI_EVENT_QUEUE_EVICTION,
RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = KFD_SMI_EVENT_QUEUE_RESTORE,
RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = KFD_SMI_EVENT_UNMAP_FROM_GPU,
RSMI_EVT_NOTIF_EVENT_PROCESS_START = KFD_SMI_EVENT_PROCESS_START,
RSMI_EVT_NOTIF_EVENT_PROCESS_END = KFD_SMI_EVENT_PROCESS_END,
RSMI_EVT_NOTIF_EVENT_ALL_PROCESS = KFD_SMI_EVENT_ALL_PROCESS,
RSMI_EVT_NOTIF_LAST = KFD_SMI_EVENT_ALL_PROCESS
} rsmi_evt_notification_type_t;
@@ -362,7 +364,7 @@ typedef enum {
//! Maximum number of characters in an event notification message;
// matches the kfd message max size
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 256
/**
* Event notification data returned from event notification API
+15
Zobrazit soubor
@@ -7434,6 +7434,21 @@ rsmi_event_notification_get(int timeout_ms,
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_EVENT_PROCESS_START:
case RSMI_EVT_NOTIF_EVENT_PROCESS_END:
{
  // The payload is parsed as "<pid in hex> <task name>" and re-emitted as a
  // human-readable "PID: <decimal pid> task: <name>" message.
  uint32_t pid = 0;
  char task[MAX_EVENT_NOTIFICATION_MSG_SIZE] = {0};
  // The %s field width below must stay smaller than the task buffer so an
  // unexpectedly long task name cannot overrun it.
  static_assert(sizeof(task) > 255, "task buffer too small for the %255s conversion");
  const int rc = sscanf(message, "%x %255s", &pid, task);
  std::stringstream msg;
  if (rc == 2) {
    msg << "PID: " << pid << " task: " << task;
  } else {
    // Both fields are required; fall back to a fixed placeholder message.
    LOG_ERROR("Failed to parse process event payload");
    msg << "PID:UNKNOWN task:UNKNOWN";
  }
  // snprintf truncates to the destination size and always NUL-terminates.
  snprintf(data_item->message, MAX_EVENT_NOTIFICATION_MSG_SIZE, "%s", msg.str().c_str());
}
break;
default:
strcpy(reinterpret_cast<char *>(&data_item->message), "Unknown event received");
break;
+4 -2
Zobrazit soubor
@@ -1926,8 +1926,10 @@ amdsmi_get_gpu_event_notification(int timeout_ms,
rsmi_evt_notification_data_t rsmi_data = r_data[i];
data[i].event = static_cast<amdsmi_evt_notification_type_t>(
rsmi_data.event);
strncpy(data[i].message, rsmi_data.message,
MAX_EVENT_NOTIFICATION_MSG_SIZE);
snprintf(data[i].message,
MAX_EVENT_NOTIFICATION_MSG_SIZE,
"%s",
rsmi_data.message);
amdsmi_status_t r = amd::smi::AMDSmiSystem::getInstance()
.gpu_index_to_handle(rsmi_data.dv_ind, &(data[i].processor_handle));
if (r != AMDSMI_STATUS_SUCCESS) return r;
+9 -1
Zobrazit soubor
@@ -61,7 +61,15 @@ static const std::map<amdsmi_evt_notification_type_t, const char *>
{AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, "AMDSMI_EVT_NOTIF_THERMAL_THROTTLE"},
{AMDSMI_EVT_NOTIF_GPU_PRE_RESET, "AMDSMI_EVT_NOTIF_GPU_PRE_RESET"},
{AMDSMI_EVT_NOTIF_GPU_POST_RESET, "AMDSMI_EVT_NOTIF_GPU_POST_RESET"},
{AMDSMI_EVT_NOTIF_RING_HANG, "AMDSMI_EVT_NOTIF_RING_HANG"},
{AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START, "AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START"},
{AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END, "AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END"},
{AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START, "AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START"},
{AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END, "AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END"},
{AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION, "AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION"},
{AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE, "AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE"},
{AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU, "AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU"},
{AMDSMI_EVT_NOTIF_PROCESS_START, "AMDSMI_EVT_NOTIF_PROCESS_START"},
{AMDSMI_EVT_NOTIF_PROCESS_END, "AMDSMI_EVT_NOTIF_PROCESS_END"},
};
const char *
NameFromEvtNotifType(amdsmi_evt_notification_type_t evt) {