libhsakmt: Handle HW_EXCEPTION events
Add new structures for HW exception events and copy the event data from KFD so that it is exposed to upper layers. Change-Id: Icd5eb98997c47620e3b86277ab6d3abb7ed7d56f
This commit is contained in:
@@ -1044,6 +1044,21 @@ typedef struct _HsaMemoryAccessFault
|
||||
HSA_EVENTID_MEMORYFLAGS Flags; // event flags
|
||||
} HsaMemoryAccessFault;
|
||||
|
||||
// Cause of a hardware exception; this is the type of HsaHwException.ResetCause
typedef enum _HSA_EVENTID_HW_EXCEPTION_CAUSE
|
||||
{
|
||||
HSA_EVENTID_HW_EXCEPTION_GPU_HANG = 0, // GPU Hang
|
||||
HSA_EVENTID_HW_EXCEPTION_ECC = 1, // SRAM ECC error
|
||||
} HSA_EVENTID_HW_EXCEPTION_CAUSE;
|
||||
|
||||
// data associated with HSA_EVENTTYPE_HW_EXCEPTION
|
||||
typedef struct _HsaHwException
|
||||
{
|
||||
HSAuint32 NodeId; // Node Id where the hardware exception occurred (translated from the KFD gpu_id)
|
||||
HSAuint32 ResetType; // copied from KFD hw_exception_data.reset_type; value meanings are not defined in this header
|
||||
HSAuint32 MemoryLost; // copied from KFD hw_exception_data.memory_lost; presumably non-zero when memory contents were lost -- TODO confirm
|
||||
HSA_EVENTID_HW_EXCEPTION_CAUSE ResetCause; // copied from KFD hw_exception_data.reset_cause
|
||||
} HsaHwException;
|
||||
|
||||
typedef struct _HsaEventData
|
||||
{
|
||||
HSA_EVENTTYPE EventType; //event type
|
||||
@@ -1062,6 +1077,8 @@ typedef struct _HsaEventData
|
||||
// data associated with HSA_EVENTTYPE_MEMORY
|
||||
HsaMemoryAccessFault MemoryAccessFault;
|
||||
|
||||
// data associated with HSA_EVENTTYPE_HW_EXCEPTION
|
||||
HsaHwException HwException;
|
||||
} EventData;
|
||||
|
||||
// the following data entries are internal to the KFD & thunk itself.
|
||||
|
||||
@@ -428,6 +428,16 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
|
||||
((event_data[i].memory_exception_data.ErrorType == 1) || (event_data[i].memory_exception_data.ErrorType == 2)) ? 1 : 0;
|
||||
Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS;
|
||||
analysis_memory_exception(&event_data[i].memory_exception_data);
|
||||
} else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION &&
|
||||
event_data[i].hw_exception_data.gpu_id) {
|
||||
|
||||
result = gpuid_to_nodeid(event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId);
|
||||
if (result != HSAKMT_STATUS_SUCCESS)
|
||||
goto out;
|
||||
|
||||
Events[i]->EventData.EventData.HwException.ResetType = event_data[i].hw_exception_data.reset_type;
|
||||
Events[i]->EventData.EventData.HwException.ResetCause = event_data[i].hw_exception_data.reset_cause;
|
||||
Events[i]->EventData.EventData.HwException.MemoryLost = event_data[i].hw_exception_data.memory_lost;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user