Revert "libhsakmt: add RAS support"

This reverts commit 1fbe010354.

Change-Id: I739b17e057f2a8a0f4375741955209d2477c704a
Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com>
Этот коммит содержится в:
Eric Huang
2018-12-04 11:15:41 -05:00
коммит произвёл JinHuiEric Huang
родитель 54807526b9
Коммит 29d11d02e8
3 изменённых файлов: 7 добавлений и 89 удалений
+1 -46
Просмотреть файл
@@ -203,8 +203,7 @@ typedef union
unsigned int WaveLaunchTrapOverrideSupported: 1; // Indicates if Wave Launch Trap Override is supported on the node.
unsigned int WaveLaunchModeSupported: 1; // Indicates if Wave Launch Mode is supported on the node.
unsigned int PreciseMemoryOperationsSupported: 1; // Indicates if Precise Memory Operations are supported on the node.
unsigned int RasSupported : 1; // Indicates if GPU RAS feature is enabled
unsigned int Reserved : 12;
unsigned int Reserved : 13;
} ui32;
} HSA_CAPABILITY;
@@ -917,47 +916,6 @@ typedef struct _HsaMemoryAccessFault
HSA_EVENTID_MEMORYFLAGS Flags; // event flags
} HsaMemoryAccessFault;
typedef enum _HSA_HW_EXCEPTION_TYPE
{
HSA_HW_EXCEPTION_TYPE_FULL_GPU_RESET = 0,
HSA_HW_EXCEPTION_TYPE_PER_ENGINE_RESET = 1,
HSA_HW_EXCEPTION_TYPE_RAS_FATAL = 2,
HSA_HW_EXCEPTION_TYPE_RAS_NOTIFY = 3
} HSA_HW_EXCEPTION_TYPE;
typedef struct _HSA_HW_EXCEPTION_CAUSE
{
unsigned int Hang : 1; // GPU hang
unsigned int Parity : 1; // RAS_ERROR_PARITY
unsigned int SingleCorrectable : 1; // ERROR_SINGLE_CORRECTABLE
unsigned int MultiUncorrectable: 1; // ERROR_MULTI_UNCORRECTABLE
unsigned int Poison : 1; // ERROR_POISON
unsigned int Reserved : 27;
} HSA_HW_EXCEPTION_CAUSE;
typedef enum _HSA_HW_EXCEPTION_BLOCK_ID
{
HSA_HW_EXCEPTION_BLOCK_UMC = 0,
HSA_HW_EXCEPTION_BLOCK_SDMA = 1,
HSA_HW_EXCEPTION_BLOCK_GFXHUB = 2,
HSA_HW_EXCEPTION_BLOCK_MMHUB = 3,
HSA_HW_EXCEPTION_BLOCK_ATHUB = 4,
HSA_HW_EXCEPTION_BLOCK_PCIE_BIF = 5,
HSA_HW_EXCEPTION_BLOCK_HDP = 6,
HSA_HW_EXCEPTION_BLOCK_UNKNOWN = 0xffffffff
} HSA_HW_EXCEPTION_BLOCK_ID;
// data associated with HSA_EVENTTYPE_HW_EXCEPTION
typedef struct _HsaHwException
{
HSA_HW_EXCEPTION_TYPE Type;
HSA_HW_EXCEPTION_CAUSE Cause;
HSA_HW_EXCEPTION_BLOCK_ID BlockId; // Id of block on where RAS error happens
bool MemoryLost;
HSAuint32 NodeId; // Id of GPU that has hw exception
} HsaHwException;
typedef struct _HsaEventData
{
HSA_EVENTTYPE EventType; //event type
@@ -976,9 +934,6 @@ typedef struct _HsaEventData
// data associated with HSA_EVENTTYPE_MEMORY
HsaMemoryAccessFault MemoryAccessFault;
// data associated with HSA_EVENTTYPE_HW_EXCEPTION
HsaHwException HwException;
} EventData;
// the following data entries are internal to the KFD & thunk itself.
+6 -23
Просмотреть файл
@@ -238,28 +238,13 @@ struct kfd_ioctl_dbg_trap_args {
#define KFD_SIGNAL_EVENT_LIMIT 4096
/* For kfd_event_data.hw_exception_data.type. */
/* For kfd_event_data.hw_exception_data.reset_type. */
#define KFD_HW_EXCEPTION_WHOLE_GPU_RESET 0
#define KFD_HW_EXCEPTION_PER_ENGINE_RESET 1
#define KFD_HW_EXCEPTION_RAS_FATAL 2
#define KFD_HW_EXCEPTION_RAS_NOTIFY 3
/* For kfd_event_data.hw_exception_data.cause. */
#define KFD_HW_EXCEPTION_GPU_HANG 0x00000001
#define KFD_HW_EXCEPTION_PARITY 0x00000002
#define KFD_HW_EXCEPTION_SINGLE_CORRECTABLE 0x00000004
#define KFD_HW_EXCEPTION_MULTI_UNCORRECTABLE 0x00000008
#define KFD_HW_EXCEPTION_POISON 0x00000010
/*For kfd_event_data.hw_exception_data.block_id */
#define KFD_HW_EXCEPTION_BLOCK_UMC 0
#define HSA_HW_EXCEPTION_BLOCK_SDMA 1
#define HSA_HW_EXCEPTION_BLOCK_GFXHUB 2
#define HSA_HW_EXCEPTION_BLOCK_MMHUB 3
#define HSA_HW_EXCEPTION_BLOCK_ATHUB 4
#define HSA_HW_EXCEPTION_BLOCK_PCIE_BIF 5
#define HSA_HW_EXCEPTION_BLOCK_HDP 6
#define HSA_HW_EXCEPTION_BLOCK_UNKNOWN 0xffffffff
/* For kfd_event_data.hw_exception_data.reset_cause. */
#define KFD_HW_EXCEPTION_GPU_HANG 0
#define KFD_HW_EXCEPTION_ECC 1
struct kfd_ioctl_create_event_args {
@@ -305,12 +290,10 @@ struct kfd_hsa_memory_exception_data {
/* hw exception data */
struct kfd_hsa_hw_exception_data {
__u32 type;
__u32 cause;
__u32 block_id;
__u32 reset_type;
__u32 reset_cause;
__u32 memory_lost;
__u32 gpu_id;
__u32 pad;
};
/* Event data */
-20
Просмотреть файл
@@ -326,26 +326,6 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS;
analysis_memory_exception(&event_data[i].memory_exception_data);
}
if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION) {
Events[i]->EventData.EventData.HwException.Type =
event_data[i].hw_exception_data.type;
Events[i]->EventData.EventData.HwException.Cause.Hang =
event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_GPU_HANG;
Events[i]->EventData.EventData.HwException.Cause.Parity =
(event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_PARITY) >> 1;
Events[i]->EventData.EventData.HwException.Cause.SingleCorrectable =
(event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_SINGLE_CORRECTABLE) >> 2;
Events[i]->EventData.EventData.HwException.Cause.MultiUncorrectable =
(event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_MULTI_UNCORRECTABLE) >> 3;
Events[i]->EventData.EventData.HwException.Cause.Poison =
(event_data[i].hw_exception_data.cause & KFD_HW_EXCEPTION_POISON) >> 4;
Events[i]->EventData.EventData.HwException.BlockId =
event_data[i].hw_exception_data.block_id;
Events[i]->EventData.EventData.HwException.MemoryLost =
event_data[i].hw_exception_data.memory_lost;
Events[i]->EventData.EventData.HwException.NodeId =
event_data[i].hw_exception_data.gpu_id;
}
}
}
out: