[SWDEV-558993] Fix list() groups printout (#772)

* Updated groups printing
* Added parameters to check_required_groups
	* Two device-group flags (check_render, check_video), since kfd and render require the same group

---------

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>

[ROCm/amdsmi commit: ee1445e2cc]
Этот коммит содержится в:
Pryor, Adam
2025-10-16 11:23:49 -05:00
коммит произвёл GitHub
родитель cda730140f
Коммит 1c6147ead5
2 изменённых файлов: 125 добавлений и 41 удалений
+22 -18
Просмотреть файл
@@ -217,7 +217,9 @@ class AMDSMICommands():
if args.gpu == None:
args.gpu = self.device_handles
_group_in_groups = self.helpers.check_required_groups()
if not self.group_check_printed:
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
# Handle multiple GPUs
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list)
@@ -246,7 +248,7 @@ class AMDSMICommands():
node_id = kfd_info['node_id']
partition_id = kfd_info['current_partition_id']
except amdsmi_exception.AmdSmiLibraryException as e:
kfd_id = node_id = "N/A"
kfd_id = node_id = partition_id = "N/A"
logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info())
# CSV format is intentionally aligned with Host
@@ -273,9 +275,6 @@ class AMDSMICommands():
"hip_uuid": "N/A",
}
# __Override__ hip_uuid if the group check failed
if not _group_in_groups:
enumeration_info["hip_uuid"] = "N/A"
# now store all the fields exactly once:
if enumeration_info['drm_render'] == "N/A":
self.logger.store_output(args.gpu, 'render', enumeration_info['drm_render'])
@@ -455,7 +454,7 @@ class AMDSMICommands():
current_platform_values += [args.partition]
if not self.group_check_printed:
self.helpers.check_required_groups()
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
if self.helpers.is_linux() and self.helpers.is_baremetal():
@@ -1586,7 +1585,7 @@ class AMDSMICommands():
args.gpu = self.device_handles
if not self.group_check_printed:
self.helpers.check_required_groups()
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
# Handle watch logic, will only enter this block once
@@ -3641,7 +3640,7 @@ class AMDSMICommands():
self.logger.table_header = ''.rjust(12)
if not self.group_check_printed:
self.helpers.check_required_groups()
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
p2p_status_cache = {}
@@ -4534,6 +4533,10 @@ class AMDSMICommands():
if args.gpu == None:
args.gpu = self.device_handles
if not self.group_check_printed:
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
# Handle multiple GPUs
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.set_gpu)
if handled_multiple_gpus:
@@ -5072,10 +5075,6 @@ class AMDSMICommands():
if core:
args.core = core
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
# Check if a GPU argument has been set
gpu_args_enabled = False
gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
@@ -5282,7 +5281,7 @@ class AMDSMICommands():
args.gpu = self.device_handles
if not self.group_check_printed:
self.helpers.check_required_groups()
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
# Handle multiple GPUs
@@ -5652,7 +5651,7 @@ class AMDSMICommands():
args.gpu = self.device_handles
if not self.group_check_printed:
self.helpers.check_required_groups()
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
# If all arguments are False, then print all values
@@ -6414,7 +6413,7 @@ class AMDSMICommands():
self.logger.table_header = ''.rjust(7)
if not self.group_check_printed:
self.helpers.check_required_groups()
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
# Populate the possible gpus and their bdfs
@@ -6753,7 +6752,7 @@ class AMDSMICommands():
args.accelerator = accelerator
if not self.group_check_printed:
self.helpers.check_required_groups()
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
###########################################
@@ -7118,7 +7117,7 @@ class AMDSMICommands():
message)
if not self.group_check_printed:
self.helpers.check_required_groups()
self.helpers.check_required_groups(check_render=True, check_video=True)
self.group_check_printed = True
if not args.cper:
@@ -7174,7 +7173,7 @@ class AMDSMICommands():
# check groups first
if not self.group_check_printed:
self.helpers.check_required_groups()
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
processors = amdsmi_interface.amdsmi_get_processor_handles()
@@ -7382,6 +7381,11 @@ class AMDSMICommands():
print("No GPUs on machine")
return
# Check that KFD permissions are available
if not self.group_check_printed:
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
device = devices[i]
listener = amdsmi_interface.AmdSmiEventReader(device,
amdsmi_interface.AmdSmiEvtNotificationType)
+103 -23
Просмотреть файл
@@ -1171,29 +1171,36 @@ class AMDSMIHelpers():
except OSError as e:
return False, e.errno, e.strerror
# Check kfd and dri for EACCES/EPERM
def check_required_groups(self):
def check_required_groups(self, check_render=True, check_video=True):
"""
Check if the current user can access kfd and dri
Specifically, only care for EACCES/EPERM
Args:
check_render (bool): Whether to check /dev/kfd & /dev/dri/renderD* devices. Defaults to True.
check_video (bool): Whether to check /dev/dri/card* devices. Defaults to True.
Returns:
bool: True if all checked devices are accessible, False if any permission errors found
"""
# Skip check if running as root.
if os.geteuid() == 0:
return
return True
paths_to_check = []
if os.path.exists("/dev/kfd"):
# Only add paths for device types that are flagged for checking
if check_render and os.path.exists("/dev/kfd"):
paths_to_check.append("/dev/kfd")
# Render group corresponds to /dev/dri/renderD*
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))]
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))]
# Video group corresponds to /dev/dri/card*
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))]
if check_video:
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))]
if not paths_to_check:
return
return True
denied = []
@@ -1206,27 +1213,100 @@ class AMDSMIHelpers():
denied.append((path, err, msg, self._stat_info(path)))
if denied:
# Collect unique group info from denied devices
required_groups = {"kfd": [], "renderD": [], "card": []}
device_types = {"kfd": [], "renderD": [], "card": []}
for path, err, msg, si in denied:
if "error" not in si:
# Categorize devices and collect unique group info
if "/dev/kfd" in path:
device_types["kfd"].append(path)
required_groups["kfd"].append(si)
elif "/dev/dri/renderD" in path:
device_types["renderD"].append(path)
required_groups["renderD"].append(si)
elif "/dev/dri/card" in path:
device_types["card"].append(path)
required_groups["card"].append(si)
# Deduplicate group info by converting to tuple for hashing
for device_type in required_groups:
unique_groups = list(dict.fromkeys(
tuple(sorted(d.items())) for d in required_groups[device_type]
))
required_groups[device_type] = [dict(item) for item in unique_groups]
lines = []
lines.append("Permission needed to access required GPU device node(s):")
for path, err, msg, si in denied:
if "error" in si:
lines.append(f" - {path}: {os.strerror(err)}; stat failed: {si['error']}")
# Collect all unique groups for usermod command
all_groups = set()
# Show summary of denied devices by type with ownership info
if device_types["kfd"]:
lines.append(" • /dev/kfd: Permission denied")
if len(required_groups["kfd"]) > 1:
lines.append(" - Required group(s):")
else:
lines.append(" - Required group:")
for group_info in required_groups["kfd"]:
lines.append(
" - {p}: {err}; owner={user}({uid}):{group}({gid});".format(
p=path,
err=os.strerror(err),
user=si["user"],
uid=si["uid"],
group=si["group"],
gid=si["gid"],
" - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format(
user=group_info["user"],
uid=group_info["uid"],
group=group_info["group"],
gid=group_info["gid"],
)
)
all_groups.add(group_info["group"])
lines.append("")
lines.append("You can try:")
lines.append(" • Add your user to the group that owns these devices:")
lines.append(" sudo usermod -aG <group> \"$USER\"\n")
if device_types["renderD"]:
lines.append(f" • /dev/dri/renderD*: {len(device_types['renderD'])} device(s) denied")
if len(required_groups["renderD"]) > 1:
lines.append(" - Required group(s):")
else:
lines.append(" - Required group:")
for group_info in required_groups["renderD"]:
lines.append(
" - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format(
user=group_info["user"],
uid=group_info["uid"],
group=group_info["group"],
gid=group_info["gid"],
)
)
all_groups.add(group_info["group"])
if device_types["card"]:
lines.append(f" • /dev/dri/card*: {len(device_types['card'])} device(s) denied")
if len(required_groups["card"]) > 1:
lines.append(" - Required group(s):")
else:
lines.append(" - Required group:")
for group_info in required_groups["card"]:
lines.append(
" - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format(
user=group_info["user"],
uid=group_info["uid"],
group=group_info["group"],
gid=group_info["gid"],
)
)
all_groups.add(group_info["group"])
# Generate usermod command with all unique groups
groups_for_usermod = ",".join(sorted(all_groups))
lines.extend([
"",
"To resolve this issue, try the following:",
" • Add your user to the required group(s):",
f" sudo usermod -aG {groups_for_usermod} \"$USER\"",
" • Log out and log back in for the group changes to take effect",
" • Alternatively, run this command with sudo/admin privileges",
""
])
print("\n".join(lines))
return False