[SWDEV-558993] Fix list() groups printout (#772)
* Updated groups printing
* Added `check_render` and `check_video` parameters to `check_required_groups`
* Use two device groups (render and video), since /dev/kfd and /dev/dri/renderD* require the same group
---------
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>
[ROCm/amdsmi commit: ee1445e2cc]
Этот коммит содержится в:
@@ -217,7 +217,9 @@ class AMDSMICommands():
|
||||
if args.gpu == None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
_group_in_groups = self.helpers.check_required_groups()
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
# Handle multiple GPUs
|
||||
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list)
|
||||
@@ -246,7 +248,7 @@ class AMDSMICommands():
|
||||
node_id = kfd_info['node_id']
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
kfd_id = node_id = "N/A"
|
||||
kfd_id = node_id = partition_id = "N/A"
|
||||
logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# CSV format is intentionally aligned with Host
|
||||
@@ -273,9 +275,6 @@ class AMDSMICommands():
|
||||
"hip_uuid": "N/A",
|
||||
}
|
||||
|
||||
# __Override__ hip_uuid if the group check failed
|
||||
if not _group_in_groups:
|
||||
enumeration_info["hip_uuid"] = "N/A"
|
||||
# now store all the fields exactly once:
|
||||
if enumeration_info['drm_render'] == "N/A":
|
||||
self.logger.store_output(args.gpu, 'render', enumeration_info['drm_render'])
|
||||
@@ -455,7 +454,7 @@ class AMDSMICommands():
|
||||
current_platform_values += [args.partition]
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
if self.helpers.is_linux() and self.helpers.is_baremetal():
|
||||
@@ -1586,7 +1585,7 @@ class AMDSMICommands():
|
||||
args.gpu = self.device_handles
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
# Handle watch logic, will only enter this block once
|
||||
@@ -3641,7 +3640,7 @@ class AMDSMICommands():
|
||||
self.logger.table_header = ''.rjust(12)
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
p2p_status_cache = {}
|
||||
@@ -4534,6 +4533,10 @@ class AMDSMICommands():
|
||||
if args.gpu == None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
# Handle multiple GPUs
|
||||
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.set_gpu)
|
||||
if handled_multiple_gpus:
|
||||
@@ -5072,10 +5075,6 @@ class AMDSMICommands():
|
||||
if core:
|
||||
args.core = core
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
# Check if a GPU argument has been set
|
||||
gpu_args_enabled = False
|
||||
gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
|
||||
@@ -5282,7 +5281,7 @@ class AMDSMICommands():
|
||||
args.gpu = self.device_handles
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
# Handle multiple GPUs
|
||||
@@ -5652,7 +5651,7 @@ class AMDSMICommands():
|
||||
args.gpu = self.device_handles
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
# If all arguments are False, the print all values
|
||||
@@ -6414,7 +6413,7 @@ class AMDSMICommands():
|
||||
self.logger.table_header = ''.rjust(7)
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
# Populate the possible gpus and their bdfs
|
||||
@@ -6753,7 +6752,7 @@ class AMDSMICommands():
|
||||
args.accelerator = accelerator
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
###########################################
|
||||
@@ -7118,7 +7117,7 @@ class AMDSMICommands():
|
||||
message)
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.helpers.check_required_groups(check_render=True, check_video=True)
|
||||
self.group_check_printed = True
|
||||
|
||||
if not args.cper:
|
||||
@@ -7174,7 +7173,7 @@ class AMDSMICommands():
|
||||
|
||||
# check groups first
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
processors = amdsmi_interface.amdsmi_get_processor_handles()
|
||||
@@ -7382,6 +7381,11 @@ class AMDSMICommands():
|
||||
print("No GPUs on machine")
|
||||
return
|
||||
|
||||
# Check that KFD permissions are available
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups(check_render=True, check_video=False)
|
||||
self.group_check_printed = True
|
||||
|
||||
device = devices[i]
|
||||
listener = amdsmi_interface.AmdSmiEventReader(device,
|
||||
amdsmi_interface.AmdSmiEvtNotificationType)
|
||||
|
||||
@@ -1171,29 +1171,36 @@ class AMDSMIHelpers():
|
||||
except OSError as e:
|
||||
return False, e.errno, e.strerror
|
||||
|
||||
# Check kfd and dri for EACCES/EPERM
|
||||
def check_required_groups(self):
|
||||
def check_required_groups(self, check_render=True, check_video=True):
|
||||
"""
|
||||
Check if the current user can access kfd and dri
|
||||
Specifically, only care for EACCES/EPERM
|
||||
|
||||
Args:
|
||||
check_render (bool): Whether to check /dev/kfd & /dev/dri/renderD* devices. Defaults to True.
|
||||
check_video (bool): Whether to check /dev/dri/card* devices. Defaults to True.
|
||||
|
||||
Returns:
|
||||
bool: True if all checked devices are accessible, False if any permission errors found
|
||||
"""
|
||||
|
||||
# Skip check if running as root.
|
||||
if os.geteuid() == 0:
|
||||
return
|
||||
return True
|
||||
|
||||
paths_to_check = []
|
||||
if os.path.exists("/dev/kfd"):
|
||||
|
||||
# Only add paths for device types that are flagged for checking
|
||||
if check_render and os.path.exists("/dev/kfd"):
|
||||
paths_to_check.append("/dev/kfd")
|
||||
|
||||
# Render group correspond to /dev/dri/renderD*
|
||||
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))]
|
||||
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))]
|
||||
|
||||
# Video group corresponds to /dev/dri/card*
|
||||
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))]
|
||||
if check_video:
|
||||
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))]
|
||||
|
||||
if not paths_to_check:
|
||||
return
|
||||
return True
|
||||
|
||||
denied = []
|
||||
|
||||
@@ -1206,27 +1213,100 @@ class AMDSMIHelpers():
|
||||
denied.append((path, err, msg, self._stat_info(path)))
|
||||
|
||||
if denied:
|
||||
# Collect unique group info from denied devices
|
||||
required_groups = {"kfd": [], "renderD": [], "card": []}
|
||||
device_types = {"kfd": [], "renderD": [], "card": []}
|
||||
|
||||
for path, err, msg, si in denied:
|
||||
if "error" not in si:
|
||||
# Categorize devices and collect unique group info
|
||||
if "/dev/kfd" in path:
|
||||
device_types["kfd"].append(path)
|
||||
required_groups["kfd"].append(si)
|
||||
elif "/dev/dri/renderD" in path:
|
||||
device_types["renderD"].append(path)
|
||||
required_groups["renderD"].append(si)
|
||||
elif "/dev/dri/card" in path:
|
||||
device_types["card"].append(path)
|
||||
required_groups["card"].append(si)
|
||||
|
||||
# Deduplicate group info by converting to tuple for hashing
|
||||
for device_type in required_groups:
|
||||
unique_groups = list(dict.fromkeys(
|
||||
tuple(sorted(d.items())) for d in required_groups[device_type]
|
||||
))
|
||||
required_groups[device_type] = [dict(item) for item in unique_groups]
|
||||
|
||||
lines = []
|
||||
lines.append("Permission needed to access required GPU device node(s):")
|
||||
for path, err, msg, si in denied:
|
||||
if "error" in si:
|
||||
lines.append(f" - {path}: {os.strerror(err)}; stat failed: {si['error']}")
|
||||
|
||||
# Collect all unique groups for usermod command
|
||||
all_groups = set()
|
||||
|
||||
# Show summary of denied devices by type with ownership info
|
||||
if device_types["kfd"]:
|
||||
lines.append(" • /dev/kfd: Permission denied")
|
||||
if len(required_groups["kfd"]) > 1:
|
||||
lines.append(" - Required group(s):")
|
||||
else:
|
||||
lines.append(" - Required group:")
|
||||
for group_info in required_groups["kfd"]:
|
||||
lines.append(
|
||||
" - {p}: {err}; owner={user}({uid}):{group}({gid});".format(
|
||||
p=path,
|
||||
err=os.strerror(err),
|
||||
user=si["user"],
|
||||
uid=si["uid"],
|
||||
group=si["group"],
|
||||
gid=si["gid"],
|
||||
" - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format(
|
||||
user=group_info["user"],
|
||||
uid=group_info["uid"],
|
||||
group=group_info["group"],
|
||||
gid=group_info["gid"],
|
||||
)
|
||||
)
|
||||
all_groups.add(group_info["group"])
|
||||
|
||||
lines.append("")
|
||||
lines.append("You can try:")
|
||||
lines.append(" • Add your user to the group that owns these devices:")
|
||||
lines.append(" sudo usermod -aG <group> \"$USER\"\n")
|
||||
if device_types["renderD"]:
|
||||
lines.append(f" • /dev/dri/renderD*: {len(device_types['renderD'])} device(s) denied")
|
||||
if len(required_groups["renderD"]) > 1:
|
||||
lines.append(" - Required group(s):")
|
||||
else:
|
||||
lines.append(" - Required group:")
|
||||
for group_info in required_groups["renderD"]:
|
||||
lines.append(
|
||||
" - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format(
|
||||
user=group_info["user"],
|
||||
uid=group_info["uid"],
|
||||
group=group_info["group"],
|
||||
gid=group_info["gid"],
|
||||
)
|
||||
)
|
||||
all_groups.add(group_info["group"])
|
||||
|
||||
if device_types["card"]:
|
||||
lines.append(f" • /dev/dri/card*: {len(device_types['card'])} device(s) denied")
|
||||
if len(required_groups["card"]) > 1:
|
||||
lines.append(" - Required group(s):")
|
||||
else:
|
||||
lines.append(" - Required group:")
|
||||
for group_info in required_groups["card"]:
|
||||
lines.append(
|
||||
" - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format(
|
||||
user=group_info["user"],
|
||||
uid=group_info["uid"],
|
||||
group=group_info["group"],
|
||||
gid=group_info["gid"],
|
||||
)
|
||||
)
|
||||
all_groups.add(group_info["group"])
|
||||
|
||||
# Generate usermod command with all unique groups
|
||||
groups_for_usermod = ",".join(sorted(all_groups))
|
||||
|
||||
lines.extend([
|
||||
"",
|
||||
"To resolve this issue, try the following:",
|
||||
" • Add your user to the required group(s):",
|
||||
f" sudo usermod -aG {groups_for_usermod} \"$USER\"",
|
||||
" • Log out and log back in for the group changes to take effect",
|
||||
" • Alternatively, run this command with sudo/admin privileges",
|
||||
""
|
||||
])
|
||||
print("\n".join(lines))
|
||||
return False
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user