SWDEV-444567 - Added Ring Hang Event

Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I2e73ba08ee0004f6f30660b2fa425ea94bafceca


[ROCm/amdsmi commit: 52843152a5]
Этот коммит содержится в:
Maisam Arif
2024-05-02 01:48:14 -05:00
коммит произвёл Maisam Arif
родитель 391db38045
Коммит 1bc244b911
9 изменённых файлов: 35 добавлений и 11 удалений
+7
Просмотреть файл
@@ -8,6 +8,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](
### Added
- **Added Ring Hang event**
Added `AMDSMI_EVT_NOTIF_RING_HANG` to the possible events in the `amdsmi_evt_notification_type_t` enum.
- **Added process isolation and clean shader APIs and CLI commands**
Added APIs and CLI commands to address LeftoverLocals security issues, allowing clearing of the SRAM data and setting process isolation on a per-GPU basis. New APIs:
- `amdsmi_get_gpu_process_isolation()`
@@ -136,6 +139,10 @@ Updates required `amdsmi_get_power_cap_info` to return in uW as originally refle
- **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**
Previously, Python interface calls to populate bad pages resulted in a `ValueError: NULL pointer access`. This also fixes the `bad-pages` CLI subcommand.
### Known Issues
- N/A
## amd_smi_lib for ROCm 6.1.1
### Added
+3 -1
Просмотреть файл
@@ -835,13 +835,15 @@ typedef struct {
* Event notification event types
*/
typedef enum {
  AMDSMI_EVT_NOTIF_NONE = 0,              //!< Not used
  AMDSMI_EVT_NOTIF_VMFAULT = 1,           //!< VM page fault
  AMDSMI_EVT_NOTIF_FIRST = AMDSMI_EVT_NOTIF_VMFAULT,  //!< Alias of the first event type
  AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2,  //!< Thermal throttle
  AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3,     //!< GPU pre reset
  AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4,    //!< GPU post reset
  AMDSMI_EVT_NOTIF_RING_HANG = 5,         //!< Ring hang event

  // Must always alias the highest-valued event type above; update it
  // whenever a new event type is appended.
  AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_RING_HANG  //!< Alias of the last event type
} amdsmi_evt_notification_type_t;
/**
+2 -1
Просмотреть файл
@@ -1145,8 +1145,9 @@ Event Type | Description
---|------
`VMFAULT` | VM page fault
`THERMAL_THROTTLE` | thermal throttle
`GPU_PRE_RESET` | gpu pre reset
`GPU_POST_RESET` | gpu post reset
`RING_HANG` | ring hang event
#### read
+2
Просмотреть файл
@@ -217,10 +217,12 @@ class AmdSmiCounterCommand(IntEnum):
class AmdSmiEvtNotificationType(IntEnum):
    """Event notification types.

    Each member takes its value from the identically suffixed
    ``AMDSMI_EVT_NOTIF_*`` constant exported by ``amdsmi_wrapper``.
    """

    NONE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_NONE
    VMFAULT = amdsmi_wrapper.AMDSMI_EVT_NOTIF_VMFAULT
    THERMAL_THROTTLE = amdsmi_wrapper.AMDSMI_EVT_NOTIF_THERMAL_THROTTLE
    GPU_PRE_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_PRE_RESET
    GPU_POST_RESET = amdsmi_wrapper.AMDSMI_EVT_NOTIF_GPU_POST_RESET
    RING_HANG = amdsmi_wrapper.AMDSMI_EVT_NOTIF_RING_HANG
class AmdSmiTemperatureMetric(IntEnum):
+7 -2
Просмотреть файл
@@ -1157,19 +1157,23 @@ amdsmi_counter_value_t = struct_amdsmi_counter_value_t
# values for enumeration 'amdsmi_evt_notification_type_t'
# Reverse lookup from numeric value to enumerator name. FIRST and LAST are
# aliases of real event types, so they share keys 1 and 5 with them; a dict
# literal keeps the last entry for a repeated key, so the alias name is what
# a lookup of 1 or 5 returns.
amdsmi_evt_notification_type_t__enumvalues = {
    0: 'AMDSMI_EVT_NOTIF_NONE',
    1: 'AMDSMI_EVT_NOTIF_VMFAULT',
    1: 'AMDSMI_EVT_NOTIF_FIRST',
    2: 'AMDSMI_EVT_NOTIF_THERMAL_THROTTLE',
    3: 'AMDSMI_EVT_NOTIF_GPU_PRE_RESET',
    4: 'AMDSMI_EVT_NOTIF_GPU_POST_RESET',
    5: 'AMDSMI_EVT_NOTIF_RING_HANG',
    5: 'AMDSMI_EVT_NOTIF_LAST',
}
AMDSMI_EVT_NOTIF_NONE = 0
AMDSMI_EVT_NOTIF_VMFAULT = 1
AMDSMI_EVT_NOTIF_FIRST = 1
AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2
AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3
AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4
AMDSMI_EVT_NOTIF_RING_HANG = 5
# LAST aliases the highest-valued event type (RING_HANG).
AMDSMI_EVT_NOTIF_LAST = 5
amdsmi_evt_notification_type_t = ctypes.c_uint32 # enum
class struct_amdsmi_evt_notification_data_t(Structure):
pass
@@ -2393,6 +2397,7 @@ __all__ = \
'AMDSMI_EVNT_XGMI_LAST', 'AMDSMI_EVT_NOTIF_FIRST',
'AMDSMI_EVT_NOTIF_GPU_POST_RESET',
'AMDSMI_EVT_NOTIF_GPU_PRE_RESET', 'AMDSMI_EVT_NOTIF_LAST',
'AMDSMI_EVT_NOTIF_NONE', 'AMDSMI_EVT_NOTIF_RING_HANG',
'AMDSMI_EVT_NOTIF_THERMAL_THROTTLE', 'AMDSMI_EVT_NOTIF_VMFAULT',
'AMDSMI_FILE_NOT_FOUND', 'AMDSMI_FREQ_IND_INVALID',
'AMDSMI_FREQ_IND_MAX', 'AMDSMI_FREQ_IND_MIN',
+1
Просмотреть файл
@@ -553,6 +553,7 @@ enum kfd_smi_event {
KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
KFD_SMI_EVENT_GPU_PRE_RESET = 3,
KFD_SMI_EVENT_GPU_POST_RESET = 4,
KFD_SMI_EVENT_RING_HANG = 5,
};
/*
 * Convert an event index into its single-bit 64-bit mask: index i selects
 * bit (i - 1), so index 1 maps to bit 0. The shift count is (i) - 1, so i
 * must be at least 1.
 */
#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
+4 -1
Просмотреть файл
@@ -357,13 +357,16 @@ typedef struct {
* Event notification event types
*/
typedef enum {
  RSMI_EVT_NOTIF_NONE = KFD_SMI_EVENT_NONE,        //!< Unused
  RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT,  //!< VM page fault
  RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT,   //!< Alias of the first event type
  RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE,  //!< Thermal throttle
  RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET,        //!< GPU pre reset
  RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET,      //!< GPU post reset
  RSMI_EVT_NOTIF_RING_HANG = KFD_SMI_EVENT_RING_HANG,                //!< Ring hang event

  // Must always alias the final event type listed above; update it
  // whenever a new event type is appended.
  RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG   //!< Alias of the last event type
} rsmi_evt_notification_type_t;
/**
+8 -6
Просмотреть файл
@@ -134,16 +134,18 @@ class rsmi_dev_perf_level_t(c_int):
RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100
# Display names for the event notification types, in the order
# VM_FAULT, THERMAL_THROTTLE, GPU_PRE_RESET, GPU_POST_RESET, RING_HANG.
# NOTE(review): this list is 0-based while RSMI_EVT_NOTIF_VMFAULT is 1 —
# confirm that callers subtract 1 before indexing with an event type.
notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET', 'RING_HANG']
class rsmi_evt_notification_type_t(c_int):
    """Event notification types as plain integer class attributes.

    NONE is 0 and the event types are numbered from 1. FIRST and LAST are
    aliases of the lowest and highest event types (VMFAULT and RING_HANG).
    """

    RSMI_EVT_NOTIF_NONE = 0
    RSMI_EVT_NOTIF_VMFAULT = 1
    RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT
    RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2
    RSMI_EVT_NOTIF_GPU_PRE_RESET = 3
    RSMI_EVT_NOTIF_GPU_POST_RESET = 4
    RSMI_EVT_NOTIF_RING_HANG = 5
    # Keep LAST pointing at the final event type when adding new ones.
    RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG
class rsmi_voltage_metric_t(c_int):
+1
Просмотреть файл
@@ -84,6 +84,7 @@ static const std::map<amdsmi_evt_notification_type_t, const char *>
{AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, "AMDSMI_EVT_NOTIF_THERMAL_THROTTLE"},
{AMDSMI_EVT_NOTIF_GPU_PRE_RESET, "AMDSMI_EVT_NOTIF_GPU_PRE_RESET"},
{AMDSMI_EVT_NOTIF_GPU_POST_RESET, "AMDSMI_EVT_NOTIF_GPU_POST_RESET"},
{AMDSMI_EVT_NOTIF_RING_HANG, "AMDSMI_EVT_NOTIF_RING_HANG"},
};
const char *
NameFromEvtNotifType(amdsmi_evt_notification_type_t evt) {