diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h index c924d14545..674b965b8d 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmttypes.h @@ -203,8 +203,7 @@ typedef union unsigned int WaveLaunchTrapOverrideSupported: 1; // Indicates if Wave Launch Trap Override is supported on the node. unsigned int WaveLaunchModeSupported: 1; // Indicates if Wave Launch Mode is supported on the node. unsigned int PreciseMemoryOperationsSupported: 1; // Indicates if Precise Memory Operations are supported on the node. - unsigned int RasSupported : 1; // Indicates if GPU RAS feature is enabled - unsigned int Reserved : 12; + unsigned int Reserved : 13; } ui32; } HSA_CAPABILITY; @@ -917,47 +916,6 @@ typedef struct _HsaMemoryAccessFault HSA_EVENTID_MEMORYFLAGS Flags; // event flags } HsaMemoryAccessFault; -typedef enum _HSA_HW_EXCEPTION_TYPE -{ - HSA_HW_EXCEPTION_TYPE_FULL_GPU_RESET = 0, - HSA_HW_EXCEPTION_TYPE_PER_ENGINE_RESET = 1, - HSA_HW_EXCEPTION_TYPE_RAS_FATAL = 2, - HSA_HW_EXCEPTION_TYPE_RAS_NOTIFY = 3 -} HSA_HW_EXCEPTION_TYPE; - -typedef struct _HSA_HW_EXCEPTION_CAUSE -{ - unsigned int Hang : 1; // GPU hang - unsigned int Parity : 1; // RAS_ERROR_PARITY - unsigned int SingleCorrectable : 1; // ERROR_SINGLE_CORRECTABLE - unsigned int MultiUncorrectable: 1; // ERROR_MULTI_UNCORRECTABLE - unsigned int Poison : 1; // ERROR_POISON - unsigned int Reserved : 27; -} HSA_HW_EXCEPTION_CAUSE; - -typedef enum _HSA_HW_EXCEPTION_BLOCK_ID -{ - HSA_HW_EXCEPTION_BLOCK_UMC = 0, - HSA_HW_EXCEPTION_BLOCK_SDMA = 1, - HSA_HW_EXCEPTION_BLOCK_GFXHUB = 2, - HSA_HW_EXCEPTION_BLOCK_MMHUB = 3, - HSA_HW_EXCEPTION_BLOCK_ATHUB = 4, - HSA_HW_EXCEPTION_BLOCK_PCIE_BIF = 5, - HSA_HW_EXCEPTION_BLOCK_HDP = 6, - - HSA_HW_EXCEPTION_BLOCK_UNKNOWN = 0xffffffff -} HSA_HW_EXCEPTION_BLOCK_ID; - -// data associated with HSA_EVENTTYPE_HW_EXCEPTION -typedef struct _HsaHwException -{ - HSA_HW_EXCEPTION_TYPE Type; - HSA_HW_EXCEPTION_CAUSE Cause; - HSA_HW_EXCEPTION_BLOCK_ID BlockId; // Id of block on where RAS error happens - bool MemoryLost; - HSAuint32 NodeId; // Id of GPU that has hw exception -} HsaHwException; - typedef struct _HsaEventData { HSA_EVENTTYPE EventType; //event type @@ -976,9 +934,6 @@ typedef struct _HsaEventData // data associated with HSA_EVENTTYPE_MEMORY HsaMemoryAccessFault MemoryAccessFault; - // data associated with HSA_EVENTTYPE_HW_EXCEPTION - HsaHwException HwException; - } EventData; // the following data entries are internal to the KFD & thunk itself. diff --git a/include/linux/kfd_ioctl.h b/include/linux/kfd_ioctl.h index d6261a9227..780d6928e1 100644 --- a/include/linux/kfd_ioctl.h +++ b/include/linux/kfd_ioctl.h @@ -238,28 +238,13 @@ struct kfd_ioctl_dbg_trap_args { #define KFD_SIGNAL_EVENT_LIMIT 4096 -/* For kfd_event_data.hw_exception_data.type. */ +/* For kfd_event_data.hw_exception_data.reset_type. */ #define KFD_HW_EXCEPTION_WHOLE_GPU_RESET 0 #define KFD_HW_EXCEPTION_PER_ENGINE_RESET 1 -#define KFD_HW_EXCEPTION_RAS_FATAL 2 -#define KFD_HW_EXCEPTION_RAS_NOTIFY 3 -/* For kfd_event_data.hw_exception_data.cause. */ -#define KFD_HW_EXCEPTION_GPU_HANG 0x00000001 -#define KFD_HW_EXCEPTION_PARITY 0x00000002 -#define KFD_HW_EXCEPTION_SINGLE_CORRECTABLE 0x00000004 -#define KFD_HW_EXCEPTION_MULTI_UNCORRECTABLE 0x00000008 -#define KFD_HW_EXCEPTION_POISON 0x00000010 - -/*For kfd_event_data.hw_exception_data.block_id */ -#define KFD_HW_EXCEPTION_BLOCK_UMC 0 -#define HSA_HW_EXCEPTION_BLOCK_SDMA 1 -#define HSA_HW_EXCEPTION_BLOCK_GFXHUB 2 -#define HSA_HW_EXCEPTION_BLOCK_MMHUB 3 -#define HSA_HW_EXCEPTION_BLOCK_ATHUB 4 -#define HSA_HW_EXCEPTION_BLOCK_PCIE_BIF 5 -#define HSA_HW_EXCEPTION_BLOCK_HDP 6 -#define HSA_HW_EXCEPTION_BLOCK_UNKNOWN 0xffffffff +/* For kfd_event_data.hw_exception_data.reset_cause. */ +#define KFD_HW_EXCEPTION_GPU_HANG 0 +#define KFD_HW_EXCEPTION_ECC 1 struct kfd_ioctl_create_event_args { @@ -305,12 +290,10 @@ struct kfd_hsa_memory_exception_data { /* hw exception data */ struct kfd_hsa_hw_exception_data { - __u32 type; - __u32 cause; - __u32 block_id; + __u32 reset_type; + __u32 reset_cause; __u32 memory_lost; __u32 gpu_id; - __u32 pad; }; /* Event data */ diff --git a/src/events.c b/src/events.c index a5828afa0d..43d99b8da8 100644 --- a/src/events.c +++ b/src/events.c @@ -326,26 +326,6 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[], Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS; analysis_memory_exception(&event_data[i].memory_exception_data); } - if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION) { - Events[i]->EventData.EventData.HwException.Type = - event_data[i].hw_exception_data.type; - Events[i]->EventData.EventData.HwException.Cause.Hang = - event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_GPU_HANG; - Events[i]->EventData.EventData.HwException.Cause.Parity = - (event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_PARITY) >> 1; - Events[i]->EventData.EventData.HwException.Cause.SingleCorrectable = - (event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_SINGLE_CORRECTABLE) >> 2; - Events[i]->EventData.EventData.HwException.Cause.MultiUncorrectable = - (event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_MULTI_UNCORRECTABLE) >> 3; - Events[i]->EventData.EventData.HwException.Cause.Poison = - (event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_POISON) >> 4; - Events[i]->EventData.EventData.HwException.BlockId = - event_data[i].hw_exception_data.block_id; - Events[i]->EventData.EventData.HwException.MemoryLost = - event_data[i].hw_exception_data.memory_lost; - Events[i]->EventData.EventData.HwException.NodeId = - event_data[i].hw_exception_data.gpu_id; - } } } out: