[SWDEV-532904] CLI lists unusable UUID without sudo (#510)

Signed-off-by: AL Musaffar, Yazen <Yazen.ALMusaffar@amd.com>

[ROCm/amdsmi commit: 01a6158c85]
Этот коммит содержится в:
AL Musaffar, Yazen
2025-07-09 15:45:03 -05:00
коммит произвёл GitHub
родитель 395a42cafa
Коммит 64bf2d6ae9
2 изменённых файлов: 49 добавлений и 25 удалений
+46 -20
Просмотреть файл
@@ -198,10 +198,20 @@ class AMDSMICommands():
# Handle No GPU passed
if args.gpu == None:
args.gpu = self.device_handles
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
# Perform one-time group check. If it fails, record that fact
# but do NOT abort—just mark that UUID should be "N/A" later.
_group_check_done = False
_group_in_groups = False
if not _group_check_done:
try:
self.helpers.check_required_groups()
_group_in_groups = True
except Exception as e:
_group_in_groups = False
# print the helper's error message exactly once:
print(f"{e}")
_group_check_done = True
# Handle multiple GPUs
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list)
@@ -213,14 +223,20 @@ class AMDSMICommands():
# Get gpu_id for logging
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
try:
bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
bdf = e.get_error_info()
try:
uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
uuid = e.get_error_info()
# Only fetch data if group check passed; otherwise force "N/A"
if _group_in_groups:
try:
bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
bdf = "N/A"
try:
uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu)
except amdsmi_exception.AmdSmiLibraryException:
uuid = "N/A"
else:
# user not in render/video → everything is N/A
bdf = "N/A"
uuid = "N/A"
try:
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
@@ -246,23 +262,33 @@ class AMDSMICommands():
if args.e:
try:
enumeration_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(args.gpu)
except:
enumeration_info = {"drm_render": "N/A",
"drm_card": "N/A",
"hip_id": "N/A",
"hip_uuid": "N/A",
"hsa_id": "N/A"}
except amdsmi_exception.AmdSmiLibraryException:
enumeration_info = {
"drm_render": "N/A",
"drm_card": "N/A",
"hsa_id": "N/A",
"hip_id": "N/A",
"hip_uuid": "N/A",
}
# __Override__ hip_uuid if the group check failed
if not _group_in_groups:
enumeration_info["hip_uuid"] = "N/A"
# now store all the fields exactly once:
if enumeration_info['drm_render'] == "N/A":
self.logger.store_output(args.gpu, 'render', enumeration_info['drm_render'])
else:
self.logger.store_output(args.gpu, 'render', f"renderD{enumeration_info['drm_render']}")
self.logger.store_output(args.gpu, 'render',
f"renderD{enumeration_info['drm_render']}")
if enumeration_info['drm_card'] == "N/A":
self.logger.store_output(args.gpu, 'card', enumeration_info['drm_card'])
else:
self.logger.store_output(args.gpu, 'card', f"card{enumeration_info['drm_card']}")
self.logger.store_output(args.gpu, 'card',
f"card{enumeration_info['drm_card']}")
self.logger.store_output(args.gpu, 'hsa_id', enumeration_info['hsa_id'])
self.logger.store_output(args.gpu, 'hip_id', enumeration_info['hip_id'])
self.logger.store_output(args.gpu, 'hip_uuid', enumeration_info['hip_uuid'])
if multiple_devices:
self.logger.store_multiple_device_output()
+3 -5
Просмотреть файл
@@ -1070,8 +1070,7 @@ class AMDSMIHelpers():
try:
user_groups = {grp.getgrgid(gid).gr_name for gid in os.getgroups()}
except Exception as e:
logging.warning("Unable to determine group memberships: %s", e)
return
raise RuntimeError(f"Unable to determine group memberships: {e}")
missing_groups = required_groups - user_groups
if missing_groups:
@@ -1079,8 +1078,7 @@ class AMDSMIHelpers():
"WARNING: User is missing the following required groups: %s. "
"Please add user to these groups."
) % ", ".join(sorted(missing_groups))
print(msg)
logging.warning(msg)
raise RuntimeError(msg)
def _severity_as_string(self, error_severity, notify_type, for_filename):
if error_severity == "non_fatal_uncorrected":
@@ -1345,7 +1343,7 @@ class AMDSMIHelpers():
severity_mask |= (1 << 0)
elif sev in ("nonfatal-corrected", "corrected"):
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
severity_mask |= (1 << 2)
severity_mask |= (1 << 2)
buffer_size = 1048576