libhsakmt: Handle HW_EXCEPTION events

Add new structures for HW exception events and copy the event data
reported by KFD into them so that it is exposed to upper layers.

Change-Id: Icd5eb98997c47620e3b86277ab6d3abb7ed7d56f
This commit is contained in:
David Yat Sin
2023-11-17 04:43:51 +00:00
parent 46fe316348
commit 01ff2f7934
2 changed files with 27 additions and 0 deletions
+17
View File
@@ -1044,6 +1044,21 @@ typedef struct _HsaMemoryAccessFault
HSA_EVENTID_MEMORYFLAGS Flags; // event flags
} HsaMemoryAccessFault;
// Cause of a hardware exception; this is the type of HsaHwException.ResetCause.
// The numeric values are assigned explicitly.
// NOTE(review): presumably they must match the reset_cause values KFD reports -- confirm against the KFD ioctl header.
typedef enum _HSA_EVENTID_HW_EXCEPTION_CAUSE
{
HSA_EVENTID_HW_EXCEPTION_GPU_HANG = 0, // GPU Hang
HSA_EVENTID_HW_EXCEPTION_ECC = 1, // SRAM ECC error
} HSA_EVENTID_HW_EXCEPTION_CAUSE;
// data associated with HSA_EVENTTYPE_HW_EXCEPTION
// Filled in by the thunk from the hw_exception_data that KFD returns for the event.
typedef struct _HsaHwException
{
HSAuint32 NodeId; // Node Id where the hardware exception occurred (translated from the KFD gpu_id)
HSAuint32 ResetType; // copied from KFD hw_exception_data.reset_type
HSAuint32 MemoryLost; // copied from KFD hw_exception_data.memory_lost; presumably non-zero when memory contents were lost -- TODO confirm
HSA_EVENTID_HW_EXCEPTION_CAUSE ResetCause; // cause of the exception (GPU hang or SRAM ECC error), copied from hw_exception_data.reset_cause
} HsaHwException;
typedef struct _HsaEventData
{
HSA_EVENTTYPE EventType; //event type
@@ -1062,6 +1077,8 @@ typedef struct _HsaEventData
// data associated with HSA_EVENTTYPE_MEMORY
HsaMemoryAccessFault MemoryAccessFault;
// data associated with HSA_EVENTTYPE_HW_EXCEPTION
HsaHwException HwException;
} EventData;
// the following data entries are internal to the KFD & thunk itself.
+10
View File
@@ -428,6 +428,16 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
((event_data[i].memory_exception_data.ErrorType == 1) || (event_data[i].memory_exception_data.ErrorType == 2)) ? 1 : 0;
Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS;
analysis_memory_exception(&event_data[i].memory_exception_data);
} else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION &&
event_data[i].hw_exception_data.gpu_id) {
result = gpuid_to_nodeid(event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId);
if (result != HSAKMT_STATUS_SUCCESS)
goto out;
Events[i]->EventData.EventData.HwException.ResetType = event_data[i].hw_exception_data.reset_type;
Events[i]->EventData.EventData.HwException.ResetCause = event_data[i].hw_exception_data.reset_cause;
Events[i]->EventData.EventData.HwException.MemoryLost = event_data[i].hw_exception_data.memory_lost;
}
}
}