[SWDEV-554697] CPER not properly displaying warnings for non-zero partition id's (#687)
* Get primary gpu_id for non-primary partitions.
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
* corrected partitions warning print logic
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Change-Id: I08be6c78ddd46e5316dc9d538de4908b65b21d43
* Updated patch with latest changes and modified
xgmi partition_id check.
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
* Typo correction
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
* adjusted logging
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Change-Id: I6d425102d8583aabbcd4d7f55c9c733428524d59
---------
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: Oosman Saeed <oossaeed@amd.com>
Co-authored-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>
[ROCm/amdsmi commit: 5398eaa6b3]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
85bcf06edd
Коммит
10bfc7c056
@@ -203,7 +203,7 @@ class AMDSMICommands():
|
||||
# Handle No GPU passed
|
||||
if args.gpu == None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
|
||||
# Perform one-time group check. If it fails, record that fact
|
||||
# but do NOT abort—just mark that UUID should be "N/A" later.
|
||||
_group_check_done = False
|
||||
@@ -293,7 +293,7 @@ class AMDSMICommands():
|
||||
self.logger.store_output(args.gpu, 'hsa_id', enumeration_info['hsa_id'])
|
||||
self.logger.store_output(args.gpu, 'hip_id', enumeration_info['hip_id'])
|
||||
self.logger.store_output(args.gpu, 'hip_uuid', enumeration_info['hip_uuid'])
|
||||
|
||||
|
||||
|
||||
if multiple_devices:
|
||||
self.logger.store_multiple_device_output()
|
||||
@@ -446,12 +446,12 @@ class AMDSMICommands():
|
||||
# amd-smi static default arguments:
|
||||
# Exclude args that are not applicable to the current platform,
|
||||
# but allow output if argument is passed.
|
||||
#
|
||||
# Note: Partition is a special case, it is no longer an amd-smi static
|
||||
#
|
||||
# Note: Partition is a special case, it is no longer an amd-smi static
|
||||
# default argument.
|
||||
# Reason: Reading current_compute_partition may momentarily wake the
|
||||
# GPU up. This is due to reading XCD registers, which is expected
|
||||
# behavior. Changing partitions is not a trivial operation,
|
||||
# behavior. Changing partitions is not a trivial operation,
|
||||
# current_compute_partition SYSFS controls this action.
|
||||
if args.partition:
|
||||
current_platform_args += ["partition"]
|
||||
@@ -2344,7 +2344,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
base_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
|
||||
if base_board_temp_holder != "N/A":
|
||||
|
||||
|
||||
base_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger,
|
||||
base_board_temp_holder,
|
||||
'\N{DEGREE SIGN}C')
|
||||
@@ -4290,7 +4290,7 @@ class AMDSMICommands():
|
||||
boost_limit = int(boost_limit.split()[0])
|
||||
else:
|
||||
boost_limit = int(boost_limit)
|
||||
|
||||
|
||||
if boost_limit < args.core_boost_limit[0][0]:
|
||||
static_dict["set_core_boost_limit"]["Response"] = f"Max allowed boostlimit is {boost_limit} MHz"
|
||||
elif boost_limit > args.core_boost_limit[0][0]:
|
||||
@@ -5193,7 +5193,7 @@ class AMDSMICommands():
|
||||
if self.helpers.is_amd_hsmp_initialized() and cpu_args_enabled:
|
||||
if args.cpu == None:
|
||||
args.cpu = self.cpu_handles
|
||||
|
||||
|
||||
if self.helpers.is_amd_hsmp_initialized() and core_args_enabled:
|
||||
if args.core == None:
|
||||
args.core = self.core_handles
|
||||
@@ -5484,7 +5484,7 @@ class AMDSMICommands():
|
||||
|
||||
#######################
|
||||
# BM commands - END #
|
||||
#######################
|
||||
#######################
|
||||
|
||||
if args.clean_local_data:
|
||||
try:
|
||||
@@ -5815,7 +5815,7 @@ class AMDSMICommands():
|
||||
# Get Current Power Cap
|
||||
try:
|
||||
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
|
||||
monitor_values['max_power'] = power_cap_info['power_cap'] # Get current power cap (`power_cap`) socket is set to
|
||||
monitor_values['max_power'] = power_cap_info['power_cap'] # Get current power cap (`power_cap`) socket is set to
|
||||
# `max_power_cap`, is the maximum value it can be set to
|
||||
monitor_values['max_power'] = self.helpers.convert_SI_unit(monitor_values['max_power'], AMDSMIHelpers.SI_Unit.MICRO)
|
||||
|
||||
@@ -6429,15 +6429,9 @@ class AMDSMICommands():
|
||||
# Populate the possible gpus and their bdfs
|
||||
xgmi_values = []
|
||||
for gpu in args.gpu:
|
||||
partition_id = -1
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(gpu)
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get kfd info for gpu %s | %s", gpu, e.get_error_info())
|
||||
|
||||
if partition_id != 0:
|
||||
logging.debug(f"Skipping xgmi command due to non zero partition {gpu} - {partition_id}")
|
||||
primary_partition = self.helpers.is_primary_partition(gpu)
|
||||
if not primary_partition:
|
||||
logging.debug(f"Skipping xgmi command due to non zero partition {gpu}")
|
||||
continue
|
||||
|
||||
logging.debug("check1 device_handle: %s", gpu)
|
||||
@@ -6497,14 +6491,8 @@ class AMDSMICommands():
|
||||
|
||||
# Populate link metrics
|
||||
for dest_gpu in args.gpu:
|
||||
partition_id = -1
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(dest_gpu)
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get kfd info for gpu %s | %s", dest_gpu, e.get_error_info())
|
||||
|
||||
if partition_id != 0:
|
||||
primary_partition = self.helpers.is_primary_partition(dest_gpu)
|
||||
if not primary_partition:
|
||||
continue
|
||||
|
||||
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
|
||||
@@ -7077,6 +7065,36 @@ class AMDSMICommands():
|
||||
args.gpu = [args.gpu]
|
||||
|
||||
args.cursor = [0] * len(args.gpu)
|
||||
|
||||
# Using all the devices given in args.gpu
|
||||
# Populate a list of all the primary partition GPU ids (GPU 0, GPU 1, etc)
|
||||
partition_warning_flag = True
|
||||
primary_partition_gpu_ids = set() # set of all primary partition GPU ids from arg.gpu
|
||||
for device_handle in args.gpu:
|
||||
# First get the partition
|
||||
partition_id = self.helpers.get_partition_id(device_handle)
|
||||
# If there is a single primary partition within args.gpu then we don't need to print the warning
|
||||
if partition_id == 0:
|
||||
partition_warning_flag = False
|
||||
break
|
||||
# Then attempt to get the primary GPU id for that partition
|
||||
primary_partition_gpu_id = self.helpers.get_primary_partition_gpu_id(device_handle)
|
||||
# Add to the set if it's a non-primary partition and we found a valid primary GPU id
|
||||
if partition_id != 0 and primary_partition_gpu_id is not None:
|
||||
primary_partition_gpu_ids.add(primary_partition_gpu_id)
|
||||
|
||||
if partition_warning_flag:
|
||||
# Create a list of the primary partitions
|
||||
primary_partitions_str = " ".join(f"GPU{gpu_id}" for gpu_id in primary_partition_gpu_ids)
|
||||
|
||||
print("WARNING: CPER files are only available on primary partitions")
|
||||
if len(primary_partition_gpu_ids) > 1:
|
||||
print(f"Try with primary partitions {primary_partitions_str}",end="")
|
||||
else:
|
||||
print(f"Try with primary partition {primary_partitions_str}",end="")
|
||||
|
||||
print()
|
||||
|
||||
while True:
|
||||
for idx, device_handle in enumerate(args.gpu):
|
||||
self.helpers.ras_cper(args, device_handle, self.logger, idx)
|
||||
@@ -7274,7 +7292,7 @@ class AMDSMICommands():
|
||||
proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu}
|
||||
except (ValueError, TypeError):
|
||||
proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu}
|
||||
|
||||
|
||||
all_process_list.append(proc_info_dict)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
@@ -1188,7 +1188,7 @@ class AMDSMIHelpers():
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
# Warning if no folder was specified elsewhere
|
||||
if not getattr(self, "_cper_warning_printed", False):
|
||||
print(f"WARNING:No CPER files will be dumped unless --folder=<folder_name> is specified and cper entries exist.")
|
||||
print(f"WARNING: No CPER files will be dumped unless --folder=<folder_name> is specified and cper entries exist.")
|
||||
self._cper_warning_printed = True
|
||||
|
||||
self._print_header(folder)
|
||||
@@ -1410,6 +1410,43 @@ class AMDSMIHelpers():
|
||||
else:
|
||||
raise ValueError("Unexpected Error getting afids from CPER file") from e
|
||||
|
||||
def get_partition_id(self, device_handle, gpu_id = None) -> int:
|
||||
partition_id = -1
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(device_handle)
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
return partition_id
|
||||
|
||||
def get_primary_partition_gpu_id(self, device_handle) -> Union[int, None]:
|
||||
try:
|
||||
bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(device_handle)
|
||||
if bdf is None:
|
||||
logging.debug("Failed to get device BDF: BDF is None")
|
||||
return None
|
||||
# Construct primary partition BDF (base + ".0" for function 0)
|
||||
primary_bdf = bdf[:10] + ".0"
|
||||
try:
|
||||
primary_device_handle = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(primary_bdf)
|
||||
partition_id = self.get_partition_id(primary_device_handle)
|
||||
if partition_id == 0:
|
||||
return self.get_gpu_id_from_device_handle(primary_device_handle)
|
||||
return None
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get primary partition device handle with BDF %s: %s", primary_bdf, e.get_error_info())
|
||||
return None
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get partition device BDF: %s", e.get_error_info())
|
||||
return None
|
||||
|
||||
def is_primary_partition(self, device_handle, gpu_id = None) -> bool:
|
||||
partition_id = self.get_partition_id(device_handle, gpu_id)
|
||||
if partition_id != 0:
|
||||
logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}")
|
||||
return False
|
||||
return True
|
||||
|
||||
def ras_cper(self, args, device_handle, logger, gpu_idx):
|
||||
# Parse severity mask dynamically from the --severity option.
|
||||
severity_mask = 0
|
||||
@@ -1437,15 +1474,8 @@ class AMDSMIHelpers():
|
||||
print("Press CTRL + C to stop.")
|
||||
self._cper_follow_prompted = True
|
||||
|
||||
partition_id = -1
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(device_handle)
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if partition_id != 0:
|
||||
logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}")
|
||||
primary_partition = self.is_primary_partition(device_handle, gpu_id)
|
||||
if not primary_partition:
|
||||
return
|
||||
|
||||
if args.folder and not getattr(self, "_cper_folder_prompted", False):
|
||||
|
||||
Ссылка в новой задаче
Block a user