Revert "libhsakmt: add RAS support"
This reverts commit 1fbe010354.
Change-Id: I739b17e057f2a8a0f4375741955209d2477c704a
Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com>
This commit is contained in:
Committed by: JinHuiEric Huang
Parent: 54807526b9
Commit: 29d11d02e8
@@ -203,8 +203,7 @@ typedef union
|
||||
unsigned int WaveLaunchTrapOverrideSupported: 1; // Indicates if Wave Launch Trap Override is supported on the node.
|
||||
unsigned int WaveLaunchModeSupported: 1; // Indicates if Wave Launch Mode is supported on the node.
|
||||
unsigned int PreciseMemoryOperationsSupported: 1; // Indicates if Precise Memory Operations are supported on the node.
|
||||
unsigned int RasSupported : 1; // Indicates if GPU RAS feature is enabled
|
||||
unsigned int Reserved : 12;
|
||||
unsigned int Reserved : 13;
|
||||
} ui32;
|
||||
} HSA_CAPABILITY;
|
||||
|
||||
@@ -917,47 +916,6 @@ typedef struct _HsaMemoryAccessFault
|
||||
HSA_EVENTID_MEMORYFLAGS Flags; // event flags
|
||||
} HsaMemoryAccessFault;
|
||||
|
||||
typedef enum _HSA_HW_EXCEPTION_TYPE
|
||||
{
|
||||
HSA_HW_EXCEPTION_TYPE_FULL_GPU_RESET = 0,
|
||||
HSA_HW_EXCEPTION_TYPE_PER_ENGINE_RESET = 1,
|
||||
HSA_HW_EXCEPTION_TYPE_RAS_FATAL = 2,
|
||||
HSA_HW_EXCEPTION_TYPE_RAS_NOTIFY = 3
|
||||
} HSA_HW_EXCEPTION_TYPE;
|
||||
|
||||
typedef struct _HSA_HW_EXCEPTION_CAUSE
|
||||
{
|
||||
unsigned int Hang : 1; // GPU hang
|
||||
unsigned int Parity : 1; // RAS_ERROR_PARITY
|
||||
unsigned int SingleCorrectable : 1; // ERROR_SINGLE_CORRECTABLE
|
||||
unsigned int MultiUncorrectable: 1; // ERROR_MULTI_UNCORRECTABLE
|
||||
unsigned int Poison : 1; // ERROR_POISON
|
||||
unsigned int Reserved : 27;
|
||||
} HSA_HW_EXCEPTION_CAUSE;
|
||||
|
||||
typedef enum _HSA_HW_EXCEPTION_BLOCK_ID
|
||||
{
|
||||
HSA_HW_EXCEPTION_BLOCK_UMC = 0,
|
||||
HSA_HW_EXCEPTION_BLOCK_SDMA = 1,
|
||||
HSA_HW_EXCEPTION_BLOCK_GFXHUB = 2,
|
||||
HSA_HW_EXCEPTION_BLOCK_MMHUB = 3,
|
||||
HSA_HW_EXCEPTION_BLOCK_ATHUB = 4,
|
||||
HSA_HW_EXCEPTION_BLOCK_PCIE_BIF = 5,
|
||||
HSA_HW_EXCEPTION_BLOCK_HDP = 6,
|
||||
|
||||
HSA_HW_EXCEPTION_BLOCK_UNKNOWN = 0xffffffff
|
||||
} HSA_HW_EXCEPTION_BLOCK_ID;
|
||||
|
||||
// data associated with HSA_EVENTTYPE_HW_EXCEPTION
|
||||
typedef struct _HsaHwException
|
||||
{
|
||||
HSA_HW_EXCEPTION_TYPE Type;
|
||||
HSA_HW_EXCEPTION_CAUSE Cause;
|
||||
HSA_HW_EXCEPTION_BLOCK_ID BlockId; // Id of block on where RAS error happens
|
||||
bool MemoryLost;
|
||||
HSAuint32 NodeId; // Id of GPU that has hw exception
|
||||
} HsaHwException;
|
||||
|
||||
typedef struct _HsaEventData
|
||||
{
|
||||
HSA_EVENTTYPE EventType; //event type
|
||||
@@ -976,9 +934,6 @@ typedef struct _HsaEventData
|
||||
// data associated with HSA_EVENTTYPE_MEMORY
|
||||
HsaMemoryAccessFault MemoryAccessFault;
|
||||
|
||||
// data associated with HSA_EVENTTYPE_HW_EXCEPTION
|
||||
HsaHwException HwException;
|
||||
|
||||
} EventData;
|
||||
|
||||
// the following data entries are internal to the KFD & thunk itself.
|
||||
|
||||
@@ -238,28 +238,13 @@ struct kfd_ioctl_dbg_trap_args {
|
||||
|
||||
#define KFD_SIGNAL_EVENT_LIMIT 4096
|
||||
|
||||
/* For kfd_event_data.hw_exception_data.type. */
|
||||
/* For kfd_event_data.hw_exception_data.reset_type. */
|
||||
#define KFD_HW_EXCEPTION_WHOLE_GPU_RESET 0
|
||||
#define KFD_HW_EXCEPTION_PER_ENGINE_RESET 1
|
||||
#define KFD_HW_EXCEPTION_RAS_FATAL 2
|
||||
#define KFD_HW_EXCEPTION_RAS_NOTIFY 3
|
||||
|
||||
/* For kfd_event_data.hw_exception_data.cause. */
|
||||
#define KFD_HW_EXCEPTION_GPU_HANG 0x00000001
|
||||
#define KFD_HW_EXCEPTION_PARITY 0x00000002
|
||||
#define KFD_HW_EXCEPTION_SINGLE_CORRECTABLE 0x00000004
|
||||
#define KFD_HW_EXCEPTION_MULTI_UNCORRECTABLE 0x00000008
|
||||
#define KFD_HW_EXCEPTION_POISON 0x00000010
|
||||
|
||||
/*For kfd_event_data.hw_exception_data.block_id */
|
||||
#define KFD_HW_EXCEPTION_BLOCK_UMC 0
|
||||
#define HSA_HW_EXCEPTION_BLOCK_SDMA 1
|
||||
#define HSA_HW_EXCEPTION_BLOCK_GFXHUB 2
|
||||
#define HSA_HW_EXCEPTION_BLOCK_MMHUB 3
|
||||
#define HSA_HW_EXCEPTION_BLOCK_ATHUB 4
|
||||
#define HSA_HW_EXCEPTION_BLOCK_PCIE_BIF 5
|
||||
#define HSA_HW_EXCEPTION_BLOCK_HDP 6
|
||||
#define HSA_HW_EXCEPTION_BLOCK_UNKNOWN 0xffffffff
|
||||
/* For kfd_event_data.hw_exception_data.reset_cause. */
|
||||
#define KFD_HW_EXCEPTION_GPU_HANG 0
|
||||
#define KFD_HW_EXCEPTION_ECC 1
|
||||
|
||||
|
||||
struct kfd_ioctl_create_event_args {
|
||||
@@ -305,12 +290,10 @@ struct kfd_hsa_memory_exception_data {
|
||||
|
||||
/* hw exception data */
|
||||
struct kfd_hsa_hw_exception_data {
|
||||
__u32 type;
|
||||
__u32 cause;
|
||||
__u32 block_id;
|
||||
__u32 reset_type;
|
||||
__u32 reset_cause;
|
||||
__u32 memory_lost;
|
||||
__u32 gpu_id;
|
||||
__u32 pad;
|
||||
};
|
||||
|
||||
/* Event data */
|
||||
|
||||
@@ -326,26 +326,6 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
|
||||
Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS;
|
||||
analysis_memory_exception(&event_data[i].memory_exception_data);
|
||||
}
|
||||
if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION) {
|
||||
Events[i]->EventData.EventData.HwException.Type =
|
||||
event_data[i].hw_exception_data.type;
|
||||
Events[i]->EventData.EventData.HwException.Cause.Hang =
|
||||
event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_GPU_HANG;
|
||||
Events[i]->EventData.EventData.HwException.Cause.Parity =
|
||||
(event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_PARITY) >> 1;
|
||||
Events[i]->EventData.EventData.HwException.Cause.SingleCorrectable =
|
||||
(event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_SINGLE_CORRECTABLE) >> 2;
|
||||
Events[i]->EventData.EventData.HwException.Cause.MultiUncorrectable =
|
||||
(event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_MULTI_UNCORRECTABLE) >> 3;
|
||||
Events[i]->EventData.EventData.HwException.Cause.Poison =
|
||||
(event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_POISON) >> 4;
|
||||
Events[i]->EventData.EventData.HwException.BlockId =
|
||||
event_data[i].hw_exception_data.block_id;
|
||||
Events[i]->EventData.EventData.HwException.MemoryLost =
|
||||
event_data[i].hw_exception_data.memory_lost;
|
||||
Events[i]->EventData.EventData.HwException.NodeId =
|
||||
event_data[i].hw_exception_data.gpu_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
out:
|
||||
|
||||
Reference in new issue
Block a user