From 24f25b413362cd21b41675ab041f3b5ad0efcee8 Mon Sep 17 00:00:00 2001 From: "Pham, Gabriel" Date: Tue, 27 May 2025 12:21:38 -0500 Subject: [PATCH] Updated docs with new KFD events (#382) * Updated docs with new KFD events --------- Signed-off-by: Pham, Gabriel [ROCm/amdsmi commit: c40d4291f65d6d7d6e8dd4bc7cfac9b885c6b258] --- .../amdsmi/docs/reference/amdsmi-py-api.md | 13 +++++-- projects/amdsmi/include/amd_smi/amdsmi.h | 34 ++++++++++--------- .../rocm_smi/include/rocm_smi/rocm_smi.h | 30 ++++++++-------- 3 files changed, 45 insertions(+), 32 deletions(-) diff --git a/projects/amdsmi/docs/reference/amdsmi-py-api.md b/projects/amdsmi/docs/reference/amdsmi-py-api.md index 6cf8f186d0..8051616fd2 100644 --- a/projects/amdsmi/docs/reference/amdsmi-py-api.md +++ b/projects/amdsmi/docs/reference/amdsmi-py-api.md @@ -1341,9 +1341,18 @@ Event Type | Description ---|------ `VMFAULT` | VM page fault `THERMAL_THROTTLE` | thermal throttle -`GPU_PRE_RESET` | gpu pre reset +`GPU_PRE_RESET` | gpu pre reset; this event includes a message which indicates the cause of the reset. 
The possible causes are: `job hang`, `RAS error`, `MES hang`, `HWS hang`, `user trigger`, and `unknown` `GPU_POST_RESET` | gpu post reset -`RING_HANG` | ring hang event +`RING_HANG` | ring hang event; this event will be deprecated in ROCm 7.0 +`MIGRATE_START` | migrate start +`MIGRATE_END` | migrate end +`PAGE_FAULT_START` | page fault start +`PAGE_FAULT_END` | page fault end +`QUEUE_EVICTION` | queue eviction +`QUEUE_RESTORE` | queue restore +`UNMAP_FROM_GPU` | unmap from GPU +`PROCESS_START` | KFD process start +`PROCESS_END` | KFD process end #### read diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index acaea3bdf3..c3221dd6ad 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -1234,23 +1234,25 @@ typedef struct { * @cond @tag{gpu_bm_linux} @endcond */ typedef enum { - AMDSMI_EVT_NOTIF_NONE = 0, //!< Not used - AMDSMI_EVT_NOTIF_VMFAULT = 1, //!< VM page fault + AMDSMI_EVT_NOTIF_NONE = 0, //!< Not used + AMDSMI_EVT_NOTIF_VMFAULT = 1, //!< VM page fault AMDSMI_EVT_NOTIF_FIRST = AMDSMI_EVT_NOTIF_VMFAULT, - AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2, - AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3, - AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4, - AMDSMI_EVT_NOTIF_RING_HANG = 5, // Ringhang now maps to AMDSMI_EVT_NOTIF_MIGRATE_START. 
- // Will be depreciated in 7.0 - AMDSMI_EVT_NOTIF_MIGRATE_START = AMDSMI_EVT_NOTIF_RING_HANG, - AMDSMI_EVT_NOTIF_MIGRATE_END = 6, - AMDSMI_EVT_NOTIF_PAGE_FAULT_START = 7, - AMDSMI_EVT_NOTIF_PAGE_FAULT_END = 8, - AMDSMI_EVT_NOTIF_QUEUE_EVICTION = 9, - AMDSMI_EVT_NOTIF_QUEUE_RESTORE = 10, - AMDSMI_EVT_NOTIF_UNMAP_FROM_GPU = 11, - AMDSMI_EVT_NOTIF_PROCESS_START = 12, - AMDSMI_EVT_NOTIF_PROCESS_END = 13, + AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2, //!< thermal throttle + AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3, //!< pre reset; event includes message indicating cause + //!< causes include job hang, RAS error, + //!< MES hang, HWS hang, user trigger, and unknown + AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4, //!< post reset + AMDSMI_EVT_NOTIF_RING_HANG = 5, //!< Ringhang now maps to AMDSMI_EVT_NOTIF_MIGRATE_START. + //!< Will be deprecated in ROCm 7.0 + AMDSMI_EVT_NOTIF_MIGRATE_START = AMDSMI_EVT_NOTIF_RING_HANG, //!< migrate start + AMDSMI_EVT_NOTIF_MIGRATE_END = 6, //!< migrate end + AMDSMI_EVT_NOTIF_PAGE_FAULT_START = 7, //!< page fault start + AMDSMI_EVT_NOTIF_PAGE_FAULT_END = 8, //!< page fault end + AMDSMI_EVT_NOTIF_QUEUE_EVICTION = 9, //!< queue eviction + AMDSMI_EVT_NOTIF_QUEUE_RESTORE = 10, //!< queue restore + AMDSMI_EVT_NOTIF_UNMAP_FROM_GPU = 11, //!< unmap from GPU + AMDSMI_EVT_NOTIF_PROCESS_START = 12, //!< KFD process start + AMDSMI_EVT_NOTIF_PROCESS_END = 13, //!< KFD process end AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_PROCESS_END } amdsmi_evt_notification_type_t; diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 5e3d10ad7d..672577ea94 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -338,21 +338,23 @@ typedef struct { * Event notification event types */ typedef enum { - RSMI_EVT_NOTIF_NONE = KFD_SMI_EVENT_NONE, //!< Unused - RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT, //!< VM page fault + RSMI_EVT_NOTIF_NONE = 
KFD_SMI_EVENT_NONE, //!< Unused + RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT, //!< VM page fault RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT, - RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE, - RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET, - RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET, - RSMI_EVT_NOTIF_EVENT_MIGRATE_START = KFD_SMI_EVENT_MIGRATE_START, - RSMI_EVT_NOTIF_EVENT_MIGRATE_END = KFD_SMI_EVENT_MIGRATE_END, - RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = KFD_SMI_EVENT_PAGE_FAULT_START, - RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = KFD_SMI_EVENT_PAGE_FAULT_END, - RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = KFD_SMI_EVENT_QUEUE_EVICTION, - RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = KFD_SMI_EVENT_QUEUE_RESTORE, - RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = KFD_SMI_EVENT_UNMAP_FROM_GPU, - RSMI_EVT_NOTIF_EVENT_PROCESS_START = KFD_SMI_EVENT_PROCESS_START, - RSMI_EVT_NOTIF_EVENT_PROCESS_END = KFD_SMI_EVENT_PROCESS_END, + RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE, //!< thermal throttle + RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET, //!< pre reset; event includes message indicating cause + //!< causes include job hang, RAS error, + //!< MES hang, HWS hang, user trigger, and unknown + RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET, //!< post reset + RSMI_EVT_NOTIF_EVENT_MIGRATE_START = KFD_SMI_EVENT_MIGRATE_START, //!< migrate start + RSMI_EVT_NOTIF_EVENT_MIGRATE_END = KFD_SMI_EVENT_MIGRATE_END, //!< migrate end + RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = KFD_SMI_EVENT_PAGE_FAULT_START, //!< page fault start + RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = KFD_SMI_EVENT_PAGE_FAULT_END, //!< page fault end + RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = KFD_SMI_EVENT_QUEUE_EVICTION, //!< queue eviction + RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = KFD_SMI_EVENT_QUEUE_RESTORE, //!< queue restore + RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = KFD_SMI_EVENT_UNMAP_FROM_GPU, //!< unmap from GPU + RSMI_EVT_NOTIF_EVENT_PROCESS_START 
= KFD_SMI_EVENT_PROCESS_START, //!< KFD process start + RSMI_EVT_NOTIF_EVENT_PROCESS_END = KFD_SMI_EVENT_PROCESS_END, //!< KFD process end RSMI_EVT_NOTIF_EVENT_ALL_PROCESS = KFD_SMI_EVENT_ALL_PROCESS, RSMI_EVT_NOTIF_LAST = KFD_SMI_EVENT_ALL_PROCESS } rsmi_evt_notification_type_t;