From 01ff2f79346ae4a53b621eb8bd6f683d03f899c8 Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Fri, 17 Nov 2023 04:43:51 +0000 Subject: [PATCH] libhsakmt: Handle HW_EXCEPTION events Add new structures for HW Exception events and copy data from KFD to expose to upper layers. Change-Id: Icd5eb98997c47620e3b86277ab6d3abb7ed7d56f --- include/hsakmttypes.h | 17 +++++++++++++++++ src/events.c | 10 ++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h index 2937c35915..9f1b2ccb24 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmttypes.h @@ -1044,6 +1044,21 @@ typedef struct _HsaMemoryAccessFault HSA_EVENTID_MEMORYFLAGS Flags; // event flags } HsaMemoryAccessFault; +typedef enum _HSA_EVENTID_HW_EXCEPTION_CAUSE +{ + HSA_EVENTID_HW_EXCEPTION_GPU_HANG = 0, // GPU Hang + HSA_EVENTID_HW_EXCEPTION_ECC = 1, // SRAM ECC error +} HSA_EVENTID_HW_EXCEPTION_CAUSE; + +// data associated with HSA_EVENTID_HW_EXCEPTION +typedef struct _HsaHwException +{ + HSAuint32 NodeId; // Node Id where the memory exception occured + HSAuint32 ResetType; + HSAuint32 MemoryLost; + HSA_EVENTID_HW_EXCEPTION_CAUSE ResetCause; +} HsaHwException; + typedef struct _HsaEventData { HSA_EVENTTYPE EventType; //event type @@ -1062,6 +1077,8 @@ typedef struct _HsaEventData // data associated with HSA_EVENTTYPE_MEMORY HsaMemoryAccessFault MemoryAccessFault; + // data associated with HSA_EVENTTYPE_HW_EXCEPTION + HsaHwException HwException; } EventData; // the following data entries are internal to the KFD & thunk itself. diff --git a/src/events.c b/src/events.c index 9ec199ac7a..60e9f6dbbf 100644 --- a/src/events.c +++ b/src/events.c @@ -428,6 +428,16 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], ((event_data[i].memory_exception_data.ErrorType == 1) || (event_data[i].memory_exception_data.ErrorType == 2)) ? 1 : 0; Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS; analysis_memory_exception(&event_data[i].memory_exception_data); + } else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION && + event_data[i].hw_exception_data.gpu_id) { + + result = gpuid_to_nodeid(event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId); + if (result != HSAKMT_STATUS_SUCCESS) + goto out; + + Events[i]->EventData.EventData.HwException.ResetType = event_data[i].hw_exception_data.reset_type; + Events[i]->EventData.EventData.HwException.ResetCause = event_data[i].hw_exception_data.reset_cause; + Events[i]->EventData.EventData.HwException.MemoryLost = event_data[i].hw_exception_data.memory_lost; } } }