Fixed post reset and ring_hang issues

Issues include:
	SWDEV-480250
	SWDEV-480255
	SWDEV-480248
Known issue:
	`amd-smi event` has threads taking events from the same device
which, in the case of resetting gpus, makes it seem like some gpus have
reset multiple times and others have not reset at all.

Signed-off-by: gabrpham <Gabriel.Pham@amd.com>
Change-Id: Ic7dcc214e0366fc1532ece579d915d34d35d5407
Αυτή η υποβολή περιλαμβάνεται σε:
gabrpham
2024-11-14 14:39:59 -06:00
υποβλήθηκε από Maisam Arif
γονέας 1586005a5b
υποβολή bd01cfc203
7 αρχεία άλλαξαν με 1466 προσθήκες και 177 διαγραφές
@@ -5579,7 +5579,14 @@ class AMDSMICommands():
events = listener.read(2000)
for event in events:
values_dict["event"] = event["event"]
values_dict["message"] = event["message"]
# parse message as its own dictionary
message_list = event["message"].split(" ")
message_dict = {}
for item in message_list:
if not item == "":
item_list = item.split(": ")
message_dict.update({item_list[0]: item_list[1]})
values_dict["message"] = message_dict
commands.logger.store_output(device, 'values', values_dict)
commands.logger.print_output()
except amdsmi_exception.AmdSmiLibraryException as e:
+2 -1
Προβολή Αρχείου
@@ -961,7 +961,8 @@ typedef enum {
#define AMDSMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
//! Maximum number of characters an event notification message will be
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 64
// matches kfd message max size
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96
/**
* Event notification data returned from event notification API
@@ -69,7 +69,7 @@ AMDSMI_MAX_CACHE_TYPES = 10
AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK = 64
AMDSMI_GPU_UUID_SIZE = 38
MAX_AMDSMI_NAME_LENGTH = 64
MAX_EVENT_NOTIFICATION_MSG_SIZE = 64
MAX_EVENT_NOTIFICATION_MSG_SIZE = 96
class AmdSmiInitFlags(IntEnum):
@@ -1300,7 +1300,7 @@ struct_amdsmi_evt_notification_data_t._pack_ = 1 # source:False
struct_amdsmi_evt_notification_data_t._fields_ = [
('processor_handle', ctypes.POINTER(None)),
('event', amdsmi_evt_notification_type_t),
('message', ctypes.c_char * 64),
('message', ctypes.c_char * 96),
('PADDING_0', ctypes.c_ubyte * 4),
]
@@ -1741,6 +1741,7 @@ struct_amdsmi_gpu_xcp_metrics_t._fields_ = [
('gfx_busy_acc', ctypes.c_uint64 * 8),
]
amdsmi_gpu_xcp_metrics_t = struct_amdsmi_gpu_xcp_metrics_t
class struct_amdsmi_gpu_metrics_t(Structure):
pass
@@ -2869,11 +2870,12 @@ __all__ = \
'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter',
'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter',
'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t',
'amdsmi_gpu_read_counter', 'amdsmi_gpu_xgmi_error_status',
'amdsmi_hsmp_freqlimit_src_names', 'amdsmi_hsmp_metrics_table_t',
'amdsmi_init', 'amdsmi_init_flags_t',
'amdsmi_init_gpu_event_notification', 'amdsmi_io_bw_encoding_t',
'amdsmi_io_link_type_t', 'amdsmi_is_P2P_accessible',
'amdsmi_gpu_read_counter', 'amdsmi_gpu_xcp_metrics_t',
'amdsmi_gpu_xgmi_error_status', 'amdsmi_hsmp_freqlimit_src_names',
'amdsmi_hsmp_metrics_table_t', 'amdsmi_init',
'amdsmi_init_flags_t', 'amdsmi_init_gpu_event_notification',
'amdsmi_io_bw_encoding_t', 'amdsmi_io_link_type_t',
'amdsmi_is_P2P_accessible',
'amdsmi_is_gpu_power_management_enabled', 'amdsmi_kfd_info_t',
'amdsmi_link_id_bw_type_t', 'amdsmi_link_metrics_t',
'amdsmi_link_type_t', 'amdsmi_memory_page_status_t',
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο Φόρτωση Διαφορών
@@ -344,9 +344,15 @@ typedef enum {
RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE,
RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET,
RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET,
RSMI_EVT_NOTIF_RING_HANG = KFD_SMI_EVENT_RING_HANG,
RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG
RSMI_EVT_NOTIF_EVENT_MIGRATE_START = KFD_SMI_EVENT_MIGRATE_START,
RSMI_EVT_NOTIF_EVENT_MIGRATE_END = KFD_SMI_EVENT_MIGRATE_END,
RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = KFD_SMI_EVENT_PAGE_FAULT_START,
RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = KFD_SMI_EVENT_PAGE_FAULT_END,
RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = KFD_SMI_EVENT_QUEUE_EVICTION,
RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = KFD_SMI_EVENT_QUEUE_RESTORE,
RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = KFD_SMI_EVENT_UNMAP_FROM_GPU,
RSMI_EVT_NOTIF_EVENT_ALL_PROCESS = KFD_SMI_EVENT_ALL_PROCESS,
RSMI_EVT_NOTIF_LAST = KFD_SMI_EVENT_ALL_PROCESS
} rsmi_evt_notification_type_t;
/**
@@ -355,7 +361,8 @@ typedef enum {
#define RSMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
//! Maximum number of characters an event notification message will be
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 64
// matches kfd message max size
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96
/**
* Event notification data returned from event notification API
@@ -1264,7 +1271,7 @@ typedef struct {
/**
* Accumulated throttler residencies
*
* Socket (thermal) -
* Socket (thermal) -
* Socket thermal violation % (greater than 0% is a violation);
* aka TVIOL
*
+217 -3
Προβολή Αρχείου
@@ -6618,16 +6618,230 @@ rsmi_event_notification_get(int timeout_ms,
reinterpret_cast<rsmi_evt_notification_data_t *>(&data[*num_elem]);
uint32_t event;
while (fscanf(anon_fp, "%x %63s\n", &event,
reinterpret_cast<char *>(&data_item->message)) == 2) {
/* Output is in format as "event information\n"
char event_in[MAX_EVENT_NOTIFICATION_MSG_SIZE];
memcpy(reinterpret_cast<char *>(event_in), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
while (fgets(event_in, MAX_EVENT_NOTIFICATION_MSG_SIZE, anon_fp)) {
/* Output is in format as "event_number message_information\n"
* The event number is expressed in hex.
* The message information is a string.
*/
char message[MAX_EVENT_NOTIFICATION_MSG_SIZE];
// parse the line here for event_number and rest of message_information
sscanf(event_in, "%x %[^\n]\n", &event, message);
// parse message based on event received
switch (event){
case RSMI_EVT_NOTIF_NONE:
strcpy(reinterpret_cast<char *>(&data_item->message), "Event type None received");
break;
case RSMI_EVT_NOTIF_VMFAULT:
{
uint32_t pid;
char task_name[MAX_EVENT_NOTIFICATION_MSG_SIZE];
memcpy(reinterpret_cast<char *>(task_name), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
sscanf(message, "%x:%s\n", &pid, task_name);
std::stringstream final_message;
final_message << "PID: " << std::to_string(pid).c_str()
<< " task name: " << task_name;
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_THERMAL_THROTTLE:
{
uint64_t bitmask;
uint64_t counter;
sscanf(message, "%llx:%llx\n", &bitmask, &counter);
std::stringstream final_message;
final_message << "bitmask: 0x" << std::hex << bitmask
<< " counter: 0x" << std::hex << counter;
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_GPU_PRE_RESET:
{
uint32_t reset_seq_num;
char reset_cause[MAX_EVENT_NOTIFICATION_MSG_SIZE];
memcpy(reinterpret_cast<char *>(reset_cause), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
sscanf(message, "%x %[^\n]\n", &reset_seq_num, reset_cause);
std::stringstream final_message;
final_message << "reset sequence number: " << std::to_string(reset_seq_num).c_str()
<< " reset cause: " << reset_cause;
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_GPU_POST_RESET:
{
uint32_t reset_seq_num;
sscanf(message, "%x %[^\n]\n", &reset_seq_num);
std::stringstream final_message;
final_message << "reset sequence number: " << std::to_string(reset_seq_num).c_str();
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_EVENT_MIGRATE_START:
{
int64_t ns;
int32_t pid;
uint32_t start;
uint32_t size;
uint16_t from;
uint16_t to;
uint16_t prefetch_loc;
uint16_t preferred_loc;
int32_t migrate_trigger;
sscanf(message, "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", &ns, &pid, &start, &size, &from, &to, &prefetch_loc, &preferred_loc, &migrate_trigger);
std::stringstream final_message;
final_message << "nd: " << std::to_string(ns).c_str()
<< " pid: " << std::to_string(pid).c_str()
<< " start: 0x" << std::hex << start
<< " size: 0x" << std::hex << size
<< " from: 0x" << std::hex << from
<< " to: 0x" << std::hex << to
<< " prefetch_loc: 0x" << std::hex << prefetch_loc
<< " preferred_loc: 0x" << std::hex << preferred_loc
<< " migrate_trigger: " << std::to_string(migrate_trigger).c_str();
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_EVENT_MIGRATE_END:
{
int64_t ns;
int32_t pid;
uint32_t start;
uint32_t size;
uint32_t from;
uint32_t to;
uint32_t migrate_trigger;
uint32_t error_code;
sscanf(message, "%lld -%d @%lx(%lx) %x->%x %d %d\n", &ns, &pid, &start, &size, &from, &to, &migrate_trigger, &error_code);
std::stringstream final_message;
final_message << "nd: " << std::to_string(ns).c_str()
<< " pid: " << std::to_string(pid).c_str()
<< " start: 0x" << std::hex << start
<< " size: 0x" << std::hex << size
<< " from: 0x" << std::hex << from
<< " to: 0x" << std::hex << to
<< " migrate_trigger: " << std::to_string(migrate_trigger).c_str()
<< " error_code: " << std::to_string(error_code).c_str();
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START:
{
int64_t ns;
int32_t pid;
uint32_t addr;
uint32_t node;
char *rw;
sscanf(message, "%lld -%d @%lx(%x) %c\n", &ns, &pid, &addr, &node, rw);
std::stringstream final_message;
final_message << "ns: " << std::to_string(ns).c_str()
<< " pid: " << std::to_string(pid).c_str()
<< " addr: 0x" << std::hex << addr
<< " node: 0x" << std::hex << node
<< " rw: " << rw;
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END:
{
int64_t ns;
int32_t pid;
uint32_t addr;
uint32_t node;
char *migrate_update;
sscanf(message, "%lld -%d @%lx(%x) %c\n", &ns, &pid, &addr, &node, migrate_update);
std::stringstream final_message;
final_message << "ns: " << std::to_string(ns).c_str()
<< " pid: " << std::to_string(pid).c_str()
<< " addr: 0x" << std::hex << addr
<< " node: 0x" << std::hex << node
<< " migrate_udpate: " << migrate_update;
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION:
{
int64_t ns;
int32_t pid;
uint32_t node;
uint32_t evict_trigger;
sscanf(message, "%lld -%d %x %d\n", &ns, &pid, &node, &evict_trigger);
std::stringstream final_message;
final_message << "ns: " << std::to_string(ns).c_str()
<< " pid: " << std::to_string(pid).c_str()
<< " node: 0x" << std::hex << node
<< " evict_trigger: " << std::to_string(evict_trigger).c_str();
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE:
{
int64_t ns;
int32_t pid;
uint32_t node;
char *rescheduled;
sscanf(message, "%lld -%d %x %c\n", &ns, &pid, &node, rescheduled);
std::stringstream final_message;
final_message << "ns: " << std::to_string(ns).c_str()
<< " pid: " << std::to_string(pid).c_str()
<< " node: 0x" << std::hex << node
<< " rescheduled: " << rescheduled;
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
case RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU:
{
int64_t ns;
int32_t pid;
uint32_t addr;
uint32_t size;
uint32_t node;
uint32_t unmap_trigger;
sscanf(message, "%lld -%d @%lx(%lx) %x %d\n", &ns, &pid, &addr, &size, &node, &unmap_trigger);
std::stringstream final_message;
final_message << "ns: " << std::to_string(ns).c_str()
<< " pid: " << std::to_string(pid).c_str()
<< " addr: 0x" <<std::hex << addr
<< " size: 0x" <<std::hex << size
<< " node: 0x" << std::hex << node
<< " unmap_trigger: " << std::to_string(unmap_trigger).c_str();
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
}
break;
default:
strcpy(reinterpret_cast<char *>(&data_item->message), "Unknown event received");
break;
}
data_item->event = (rsmi_evt_notification_type_t)event;
data_item->dv_ind = fd_indx_to_dev_id[i];
++(*num_elem);
// zero out event_in after each use
memcpy(reinterpret_cast<char *>(event_in), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
if (*num_elem >= buffer_size) {
break;
}