Fixed post reset and ring_hang issues
Issues include: SWDEV-480250 SWDEV-480255 SWDEV-480248 Known issue: `amd-smi event` has threads taking events from the same device which, in the case of resetting GPUs, makes it seem like some GPUs have reset multiple times and others have not reset at all. Signed-off-by: gabrpham <Gabriel.Pham@amd.com> Change-Id: Ic7dcc214e0366fc1532ece579d915d34d35d5407
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -5579,7 +5579,14 @@ class AMDSMICommands():
|
||||
events = listener.read(2000)
|
||||
for event in events:
|
||||
values_dict["event"] = event["event"]
|
||||
values_dict["message"] = event["message"]
|
||||
# parse message as its own dictionary
|
||||
message_list = event["message"].split(" ")
|
||||
message_dict = {}
|
||||
for item in message_list:
|
||||
if not item == "":
|
||||
item_list = item.split(": ")
|
||||
message_dict.update({item_list[0]: item_list[1]})
|
||||
values_dict["message"] = message_dict
|
||||
commands.logger.store_output(device, 'values', values_dict)
|
||||
commands.logger.print_output()
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
|
||||
@@ -961,7 +961,8 @@ typedef enum {
|
||||
#define AMDSMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
|
||||
|
||||
//! Maximum number of characters an event notification message will be
|
||||
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 64
|
||||
// matches kfd message max size
|
||||
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96
|
||||
|
||||
/**
|
||||
* Event notification data returned from event notification API
|
||||
|
||||
@@ -69,7 +69,7 @@ AMDSMI_MAX_CACHE_TYPES = 10
|
||||
AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK = 64
|
||||
AMDSMI_GPU_UUID_SIZE = 38
|
||||
MAX_AMDSMI_NAME_LENGTH = 64
|
||||
MAX_EVENT_NOTIFICATION_MSG_SIZE = 64
|
||||
MAX_EVENT_NOTIFICATION_MSG_SIZE = 96
|
||||
|
||||
|
||||
class AmdSmiInitFlags(IntEnum):
|
||||
|
||||
@@ -1300,7 +1300,7 @@ struct_amdsmi_evt_notification_data_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_evt_notification_data_t._fields_ = [
|
||||
('processor_handle', ctypes.POINTER(None)),
|
||||
('event', amdsmi_evt_notification_type_t),
|
||||
('message', ctypes.c_char * 64),
|
||||
('message', ctypes.c_char * 96),
|
||||
('PADDING_0', ctypes.c_ubyte * 4),
|
||||
]
|
||||
|
||||
@@ -1741,6 +1741,7 @@ struct_amdsmi_gpu_xcp_metrics_t._fields_ = [
|
||||
('gfx_busy_acc', ctypes.c_uint64 * 8),
|
||||
]
|
||||
|
||||
amdsmi_gpu_xcp_metrics_t = struct_amdsmi_gpu_xcp_metrics_t
|
||||
class struct_amdsmi_gpu_metrics_t(Structure):
|
||||
pass
|
||||
|
||||
@@ -2869,11 +2870,12 @@ __all__ = \
|
||||
'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter',
|
||||
'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter',
|
||||
'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t',
|
||||
'amdsmi_gpu_read_counter', 'amdsmi_gpu_xgmi_error_status',
|
||||
'amdsmi_hsmp_freqlimit_src_names', 'amdsmi_hsmp_metrics_table_t',
|
||||
'amdsmi_init', 'amdsmi_init_flags_t',
|
||||
'amdsmi_init_gpu_event_notification', 'amdsmi_io_bw_encoding_t',
|
||||
'amdsmi_io_link_type_t', 'amdsmi_is_P2P_accessible',
|
||||
'amdsmi_gpu_read_counter', 'amdsmi_gpu_xcp_metrics_t',
|
||||
'amdsmi_gpu_xgmi_error_status', 'amdsmi_hsmp_freqlimit_src_names',
|
||||
'amdsmi_hsmp_metrics_table_t', 'amdsmi_init',
|
||||
'amdsmi_init_flags_t', 'amdsmi_init_gpu_event_notification',
|
||||
'amdsmi_io_bw_encoding_t', 'amdsmi_io_link_type_t',
|
||||
'amdsmi_is_P2P_accessible',
|
||||
'amdsmi_is_gpu_power_management_enabled', 'amdsmi_kfd_info_t',
|
||||
'amdsmi_link_id_bw_type_t', 'amdsmi_link_metrics_t',
|
||||
'amdsmi_link_type_t', 'amdsmi_memory_page_status_t',
|
||||
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -344,9 +344,15 @@ typedef enum {
|
||||
RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE,
|
||||
RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET,
|
||||
RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET,
|
||||
RSMI_EVT_NOTIF_RING_HANG = KFD_SMI_EVENT_RING_HANG,
|
||||
|
||||
RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG
|
||||
RSMI_EVT_NOTIF_EVENT_MIGRATE_START = KFD_SMI_EVENT_MIGRATE_START,
|
||||
RSMI_EVT_NOTIF_EVENT_MIGRATE_END = KFD_SMI_EVENT_MIGRATE_END,
|
||||
RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START = KFD_SMI_EVENT_PAGE_FAULT_START,
|
||||
RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END = KFD_SMI_EVENT_PAGE_FAULT_END,
|
||||
RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION = KFD_SMI_EVENT_QUEUE_EVICTION,
|
||||
RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE = KFD_SMI_EVENT_QUEUE_RESTORE,
|
||||
RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU = KFD_SMI_EVENT_UNMAP_FROM_GPU,
|
||||
RSMI_EVT_NOTIF_EVENT_ALL_PROCESS = KFD_SMI_EVENT_ALL_PROCESS,
|
||||
RSMI_EVT_NOTIF_LAST = KFD_SMI_EVENT_ALL_PROCESS
|
||||
} rsmi_evt_notification_type_t;
|
||||
|
||||
/**
|
||||
@@ -355,7 +361,8 @@ typedef enum {
|
||||
#define RSMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
|
||||
|
||||
//! Maximum number of characters an event notification message will be
|
||||
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 64
|
||||
// matches kfd message max size
|
||||
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96
|
||||
|
||||
/**
|
||||
* Event notification data returned from event notification API
|
||||
@@ -1264,7 +1271,7 @@ typedef struct {
|
||||
/**
|
||||
* Accumulated throttler residencies
|
||||
*
|
||||
* Socket (thermal) -
|
||||
* Socket (thermal) -
|
||||
* Socket thermal violation % (greater than 0% is a violation);
|
||||
* aka TVIOL
|
||||
*
|
||||
|
||||
@@ -6618,16 +6618,230 @@ rsmi_event_notification_get(int timeout_ms,
|
||||
reinterpret_cast<rsmi_evt_notification_data_t *>(&data[*num_elem]);
|
||||
|
||||
uint32_t event;
|
||||
while (fscanf(anon_fp, "%x %63s\n", &event,
|
||||
reinterpret_cast<char *>(&data_item->message)) == 2) {
|
||||
/* Output is in format as "event information\n"
|
||||
char event_in[MAX_EVENT_NOTIFICATION_MSG_SIZE];
|
||||
memcpy(reinterpret_cast<char *>(event_in), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
|
||||
while (fgets(event_in, MAX_EVENT_NOTIFICATION_MSG_SIZE, anon_fp)) {
|
||||
/* Each line read is in the format "event_number message_information\n"
|
||||
* The event number is expressed in hex.
|
||||
* The message information is a string.
|
||||
*/
|
||||
char message[MAX_EVENT_NOTIFICATION_MSG_SIZE];
|
||||
// parse the line here for event_number and rest of message_information
|
||||
sscanf(event_in, "%x %[^\n]\n", &event, message);
|
||||
|
||||
// parse message based on event received
|
||||
switch (event){
|
||||
case RSMI_EVT_NOTIF_NONE:
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), "Event type None received");
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_VMFAULT:
|
||||
{
|
||||
uint32_t pid;
|
||||
char task_name[MAX_EVENT_NOTIFICATION_MSG_SIZE];
|
||||
memcpy(reinterpret_cast<char *>(task_name), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
|
||||
|
||||
sscanf(message, "%x:%s\n", &pid, task_name);
|
||||
std::stringstream final_message;
|
||||
final_message << "PID: " << std::to_string(pid).c_str()
|
||||
<< " task name: " << task_name;
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_THERMAL_THROTTLE:
|
||||
{
|
||||
uint64_t bitmask;
|
||||
uint64_t counter;
|
||||
|
||||
sscanf(message, "%llx:%llx\n", &bitmask, &counter);
|
||||
std::stringstream final_message;
|
||||
final_message << "bitmask: 0x" << std::hex << bitmask
|
||||
<< " counter: 0x" << std::hex << counter;
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_GPU_PRE_RESET:
|
||||
{
|
||||
uint32_t reset_seq_num;
|
||||
char reset_cause[MAX_EVENT_NOTIFICATION_MSG_SIZE];
|
||||
memcpy(reinterpret_cast<char *>(reset_cause), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
|
||||
|
||||
sscanf(message, "%x %[^\n]\n", &reset_seq_num, reset_cause);
|
||||
std::stringstream final_message;
|
||||
final_message << "reset sequence number: " << std::to_string(reset_seq_num).c_str()
|
||||
<< " reset cause: " << reset_cause;
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_GPU_POST_RESET:
|
||||
{
|
||||
uint32_t reset_seq_num;
|
||||
|
||||
sscanf(message, "%x %[^\n]\n", &reset_seq_num);
|
||||
std::stringstream final_message;
|
||||
final_message << "reset sequence number: " << std::to_string(reset_seq_num).c_str();
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_EVENT_MIGRATE_START:
|
||||
{
|
||||
int64_t ns;
|
||||
int32_t pid;
|
||||
uint32_t start;
|
||||
uint32_t size;
|
||||
uint16_t from;
|
||||
uint16_t to;
|
||||
uint16_t prefetch_loc;
|
||||
uint16_t preferred_loc;
|
||||
int32_t migrate_trigger;
|
||||
|
||||
sscanf(message, "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", &ns, &pid, &start, &size, &from, &to, &prefetch_loc, &preferred_loc, &migrate_trigger);
|
||||
std::stringstream final_message;
|
||||
final_message << "nd: " << std::to_string(ns).c_str()
|
||||
<< " pid: " << std::to_string(pid).c_str()
|
||||
<< " start: 0x" << std::hex << start
|
||||
<< " size: 0x" << std::hex << size
|
||||
<< " from: 0x" << std::hex << from
|
||||
<< " to: 0x" << std::hex << to
|
||||
<< " prefetch_loc: 0x" << std::hex << prefetch_loc
|
||||
<< " preferred_loc: 0x" << std::hex << preferred_loc
|
||||
<< " migrate_trigger: " << std::to_string(migrate_trigger).c_str();
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_EVENT_MIGRATE_END:
|
||||
{
|
||||
int64_t ns;
|
||||
int32_t pid;
|
||||
uint32_t start;
|
||||
uint32_t size;
|
||||
uint32_t from;
|
||||
uint32_t to;
|
||||
uint32_t migrate_trigger;
|
||||
uint32_t error_code;
|
||||
|
||||
sscanf(message, "%lld -%d @%lx(%lx) %x->%x %d %d\n", &ns, &pid, &start, &size, &from, &to, &migrate_trigger, &error_code);
|
||||
std::stringstream final_message;
|
||||
final_message << "nd: " << std::to_string(ns).c_str()
|
||||
<< " pid: " << std::to_string(pid).c_str()
|
||||
<< " start: 0x" << std::hex << start
|
||||
<< " size: 0x" << std::hex << size
|
||||
<< " from: 0x" << std::hex << from
|
||||
<< " to: 0x" << std::hex << to
|
||||
<< " migrate_trigger: " << std::to_string(migrate_trigger).c_str()
|
||||
<< " error_code: " << std::to_string(error_code).c_str();
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START:
|
||||
{
|
||||
int64_t ns;
|
||||
int32_t pid;
|
||||
uint32_t addr;
|
||||
uint32_t node;
|
||||
char *rw;
|
||||
|
||||
sscanf(message, "%lld -%d @%lx(%x) %c\n", &ns, &pid, &addr, &node, rw);
|
||||
std::stringstream final_message;
|
||||
final_message << "ns: " << std::to_string(ns).c_str()
|
||||
<< " pid: " << std::to_string(pid).c_str()
|
||||
<< " addr: 0x" << std::hex << addr
|
||||
<< " node: 0x" << std::hex << node
|
||||
<< " rw: " << rw;
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END:
|
||||
{
|
||||
int64_t ns;
|
||||
int32_t pid;
|
||||
uint32_t addr;
|
||||
uint32_t node;
|
||||
char *migrate_update;
|
||||
|
||||
sscanf(message, "%lld -%d @%lx(%x) %c\n", &ns, &pid, &addr, &node, migrate_update);
|
||||
std::stringstream final_message;
|
||||
final_message << "ns: " << std::to_string(ns).c_str()
|
||||
<< " pid: " << std::to_string(pid).c_str()
|
||||
<< " addr: 0x" << std::hex << addr
|
||||
<< " node: 0x" << std::hex << node
|
||||
<< " migrate_udpate: " << migrate_update;
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION:
|
||||
{
|
||||
int64_t ns;
|
||||
int32_t pid;
|
||||
uint32_t node;
|
||||
uint32_t evict_trigger;
|
||||
|
||||
sscanf(message, "%lld -%d %x %d\n", &ns, &pid, &node, &evict_trigger);
|
||||
std::stringstream final_message;
|
||||
final_message << "ns: " << std::to_string(ns).c_str()
|
||||
<< " pid: " << std::to_string(pid).c_str()
|
||||
<< " node: 0x" << std::hex << node
|
||||
<< " evict_trigger: " << std::to_string(evict_trigger).c_str();
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE:
|
||||
{
|
||||
int64_t ns;
|
||||
int32_t pid;
|
||||
uint32_t node;
|
||||
char *rescheduled;
|
||||
|
||||
sscanf(message, "%lld -%d %x %c\n", &ns, &pid, &node, rescheduled);
|
||||
std::stringstream final_message;
|
||||
final_message << "ns: " << std::to_string(ns).c_str()
|
||||
<< " pid: " << std::to_string(pid).c_str()
|
||||
<< " node: 0x" << std::hex << node
|
||||
<< " rescheduled: " << rescheduled;
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
case RSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU:
|
||||
{
|
||||
int64_t ns;
|
||||
int32_t pid;
|
||||
uint32_t addr;
|
||||
uint32_t size;
|
||||
uint32_t node;
|
||||
uint32_t unmap_trigger;
|
||||
|
||||
sscanf(message, "%lld -%d @%lx(%lx) %x %d\n", &ns, &pid, &addr, &size, &node, &unmap_trigger);
|
||||
std::stringstream final_message;
|
||||
final_message << "ns: " << std::to_string(ns).c_str()
|
||||
<< " pid: " << std::to_string(pid).c_str()
|
||||
<< " addr: 0x" <<std::hex << addr
|
||||
<< " size: 0x" <<std::hex << size
|
||||
<< " node: 0x" << std::hex << node
|
||||
<< " unmap_trigger: " << std::to_string(unmap_trigger).c_str();
|
||||
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), final_message.str().c_str());
|
||||
}
|
||||
break;
|
||||
default:
|
||||
strcpy(reinterpret_cast<char *>(&data_item->message), "Unknown event received");
|
||||
break;
|
||||
}
|
||||
data_item->event = (rsmi_evt_notification_type_t)event;
|
||||
data_item->dv_ind = fd_indx_to_dev_id[i];
|
||||
++(*num_elem);
|
||||
|
||||
// zero out event_in after each use
|
||||
memcpy(reinterpret_cast<char *>(event_in), "\0", MAX_EVENT_NOTIFICATION_MSG_SIZE);
|
||||
|
||||
if (*num_elem >= buffer_size) {
|
||||
break;
|
||||
}
|
||||
|
||||
Αναφορά σε νέο ζήτημα
Block a user