Updated docs with new KFD events (#382)

* Updated docs with new KFD events

---------

Signed-off-by: Pham, Gabriel <Gabriel.Pham@amd.com>


[ROCm/amdsmi commit: c40d4291f6]
This commit is contained in:
Pham, Gabriel
2025-05-27 12:21:38 -05:00
committad av GitHub
förälder 004c51d909
incheckning 24f25b4133
3 ändrade filer med 45 tillägg och 32 borttagningar
+11 -2
Visa fil
@@ -1341,9 +1341,18 @@ Event Type | Description
---|------
`VMFAULT` | VM page fault
`THERMAL_THROTTLE` | thermal throttle
`GPU_PRE_RESET` | gpu pre reset
`GPU_PRE_RESET` | gpu pre reset; this event includes a message indicating the cause of the reset, which is one of: `job hang`, `RAS error`, `MES hang`, `HWS hang`, `user trigger`, or `unknown`
`GPU_POST_RESET` | gpu post reset
`RING_HANG` | ring hang event
`RING_HANG` | ring hang event; this event will be deprecated in ROCm 7.0
`MIGRATE_START` | migrate start
`MIGRATE_END` | migrate end
`PAGE_FAULT_START` | page fault start
`PAGE_FAULT_END` | page fault end
`QUEUE_EVICTION` | queue eviction
`QUEUE_RESTORE` | queue restore
`UNMAP_FROM_GPU` | unmap from GPU
`PROCESS_START` | KFD process start
`PROCESS_END` | KFD process end
#### read
+18 -16
Visa fil
@@ -1234,23 +1234,25 @@ typedef struct {
* @cond @tag{gpu_bm_linux} @endcond
*/
typedef enum {
AMDSMI_EVT_NOTIF_NONE = 0, //!< Not used
AMDSMI_EVT_NOTIF_VMFAULT = 1, //!< VM page fault
AMDSMI_EVT_NOTIF_NONE = 0, //!< Not used
AMDSMI_EVT_NOTIF_VMFAULT = 1, //!< VM page fault
AMDSMI_EVT_NOTIF_FIRST = AMDSMI_EVT_NOTIF_VMFAULT, //!< Alias of AMDSMI_EVT_NOTIF_VMFAULT; first event type after NONE
AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2,
AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3,
AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4,
AMDSMI_EVT_NOTIF_RING_HANG = 5, // Ringhang now maps to AMDSMI_EVT_NOTIF_MIGRATE_START.
// Will be depreciated in 7.0
AMDSMI_EVT_NOTIF_MIGRATE_START = AMDSMI_EVT_NOTIF_RING_HANG,
AMDSMI_EVT_NOTIF_MIGRATE_END = 6,
AMDSMI_EVT_NOTIF_PAGE_FAULT_START = 7,
AMDSMI_EVT_NOTIF_PAGE_FAULT_END = 8,
AMDSMI_EVT_NOTIF_QUEUE_EVICTION = 9,
AMDSMI_EVT_NOTIF_QUEUE_RESTORE = 10,
AMDSMI_EVT_NOTIF_UNMAP_FROM_GPU = 11,
AMDSMI_EVT_NOTIF_PROCESS_START = 12,
AMDSMI_EVT_NOTIF_PROCESS_END = 13,
AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2, //!< thermal throttle
AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3, //!< pre reset; event includes message indicating cause
//!< causes include job hang, RAS error,
//!< MES hang, HWS hang, user trigger, and unknown
AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4, //!< post reset
AMDSMI_EVT_NOTIF_RING_HANG = 5, //!< Ringhang now maps to AMDSMI_EVT_NOTIF_MIGRATE_START.
//!< Will be deprecated in ROCm 7.0
AMDSMI_EVT_NOTIF_MIGRATE_START = AMDSMI_EVT_NOTIF_RING_HANG, //!< migrate start
AMDSMI_EVT_NOTIF_MIGRATE_END = 6, //!< migrate end
AMDSMI_EVT_NOTIF_PAGE_FAULT_START = 7, //!< page fault start
AMDSMI_EVT_NOTIF_PAGE_FAULT_END = 8, //!< page fault end
AMDSMI_EVT_NOTIF_QUEUE_EVICTION = 9, //!< queue eviction
AMDSMI_EVT_NOTIF_QUEUE_RESTORE = 10, //!< queue restore
AMDSMI_EVT_NOTIF_UNMAP_FROM_GPU = 11, //!< unmap from GPU
AMDSMI_EVT_NOTIF_PROCESS_START = 12, //!< KFD process start
AMDSMI_EVT_NOTIF_PROCESS_END = 13, //!< KFD process end
AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_PROCESS_END //!< Alias of AMDSMI_EVT_NOTIF_PROCESS_END, the highest-valued event type listed here
} amdsmi_evt_notification_type_t;
@@ -338,21 +338,23 @@ typedef struct {
* Event notification event types
*/
typedef enum {
RSMI_EVT_NOTIF_NONE = KFD_SMI_EVENT_NONE, //!< Unused
RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT, //!< VM page fault
RSMI_EVT_NOTIF_NONE = KFD_SMI_EVENT_NONE, //!< Unused
RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT, //!< VM page fault
RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT, //!< Alias of RSMI_EVT_NOTIF_VMFAULT
RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE,
RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET,
RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET,
RSMI_EVT_NOTIF_EVENT_MIGRATE_START = KFD_SMI_EVENT_MIGRATE_START,
RSMI_EVT_NOTIF_EVENT_MIGRATE_END = KFD_SMI_EVENT_MIGRATE_END,
RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = KFD_SMI_EVENT_PAGE_FAULT_START,
RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = KFD_SMI_EVENT_PAGE_FAULT_END,
RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = KFD_SMI_EVENT_QUEUE_EVICTION,
RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = KFD_SMI_EVENT_QUEUE_RESTORE,
RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = KFD_SMI_EVENT_UNMAP_FROM_GPU,
RSMI_EVT_NOTIF_EVENT_PROCESS_START = KFD_SMI_EVENT_PROCESS_START,
RSMI_EVT_NOTIF_EVENT_PROCESS_END = KFD_SMI_EVENT_PROCESS_END,
RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE, //!< thermal throttle
RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET, //!< pre reset; event includes message indicating cause
//!< causes include job hang, RAS error,
//!< MES hang, HWS hang, user trigger, and unknown
RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET, //!< post reset
RSMI_EVT_NOTIF_EVENT_MIGRATE_START = KFD_SMI_EVENT_MIGRATE_START, //!< migrate start
RSMI_EVT_NOTIF_EVENT_MIGRATE_END = KFD_SMI_EVENT_MIGRATE_END, //!< migrate end
RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = KFD_SMI_EVENT_PAGE_FAULT_START, //!< page fault start
RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = KFD_SMI_EVENT_PAGE_FAULT_END, //!< page fault end
RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = KFD_SMI_EVENT_QUEUE_EVICTION, //!< queue eviction
RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = KFD_SMI_EVENT_QUEUE_RESTORE, //!< queue restore
RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = KFD_SMI_EVENT_UNMAP_FROM_GPU, //!< unmap from GPU
RSMI_EVT_NOTIF_EVENT_PROCESS_START = KFD_SMI_EVENT_PROCESS_START, //!< KFD process start
RSMI_EVT_NOTIF_EVENT_PROCESS_END = KFD_SMI_EVENT_PROCESS_END, //!< KFD process end
RSMI_EVT_NOTIF_EVENT_ALL_PROCESS = KFD_SMI_EVENT_ALL_PROCESS, //!< NOTE(review): presumably selects events from all processes; confirm against the KFD header
RSMI_EVT_NOTIF_LAST = KFD_SMI_EVENT_ALL_PROCESS //!< Same value as RSMI_EVT_NOTIF_EVENT_ALL_PROCESS
} rsmi_evt_notification_type_t;