diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index b13d935401..5b2327ecff 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -217,7 +217,9 @@ class AMDSMICommands(): if args.gpu == None: args.gpu = self.device_handles - _group_in_groups = self.helpers.check_required_groups() + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True # Handle multiple GPUs handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list) @@ -246,7 +248,7 @@ class AMDSMICommands(): node_id = kfd_info['node_id'] partition_id = kfd_info['current_partition_id'] except amdsmi_exception.AmdSmiLibraryException as e: - kfd_id = node_id = "N/A" + kfd_id = node_id = partition_id = "N/A" logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info()) # CSV format is intentionally aligned with Host @@ -273,9 +275,6 @@ class AMDSMICommands(): "hip_uuid": "N/A", } - # __Override__ hip_uuid if the group check failed - if not _group_in_groups: - enumeration_info["hip_uuid"] = "N/A" # now store all the fields exactly once: if enumeration_info['drm_render'] == "N/A": self.logger.store_output(args.gpu, 'render', enumeration_info['drm_render']) @@ -455,7 +454,7 @@ class AMDSMICommands(): current_platform_values += [args.partition] if not self.group_check_printed: - self.helpers.check_required_groups() + self.helpers.check_required_groups(check_render=True, check_video=False) self.group_check_printed = True if self.helpers.is_linux() and self.helpers.is_baremetal(): @@ -1586,7 +1585,7 @@ class AMDSMICommands(): args.gpu = self.device_handles if not self.group_check_printed: - self.helpers.check_required_groups() + self.helpers.check_required_groups(check_render=True, check_video=False) self.group_check_printed = True # Handle watch logic, will only enter this block once @@ -3641,7 +3640,7 @@ class AMDSMICommands(): self.logger.table_header = ''.rjust(12) if not self.group_check_printed: - self.helpers.check_required_groups() + self.helpers.check_required_groups(check_render=True, check_video=False) self.group_check_printed = True p2p_status_cache = {} @@ -4534,6 +4533,10 @@ class AMDSMICommands(): if args.gpu == None: args.gpu = self.device_handles + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + # Handle multiple GPUs handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.set_gpu) if handled_multiple_gpus: @@ -5072,10 +5075,6 @@ class AMDSMICommands(): if core: args.core = core - if not self.group_check_printed: - self.helpers.check_required_groups() - self.group_check_printed = True - # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", @@ -5282,7 +5281,7 @@ class AMDSMICommands(): args.gpu = self.device_handles if not self.group_check_printed: - self.helpers.check_required_groups() + self.helpers.check_required_groups(check_render=True, check_video=False) self.group_check_printed = True # Handle multiple GPUs @@ -5652,7 +5651,7 @@ class AMDSMICommands(): args.gpu = self.device_handles if not self.group_check_printed: - self.helpers.check_required_groups() + self.helpers.check_required_groups(check_render=True, check_video=False) self.group_check_printed = True # If all arguments are False, the print all values @@ -6414,7 +6413,7 @@ class AMDSMICommands(): self.logger.table_header = ''.rjust(7) if not self.group_check_printed: - self.helpers.check_required_groups() + self.helpers.check_required_groups(check_render=True, check_video=False) self.group_check_printed = True # Populate the possible gpus and their bdfs @@ -6753,7 +6752,7 @@ class AMDSMICommands(): args.accelerator = accelerator if not self.group_check_printed: - self.helpers.check_required_groups() + self.helpers.check_required_groups(check_render=True, check_video=False) self.group_check_printed = True ########################################### @@ -7118,7 +7117,7 @@ class AMDSMICommands(): message) if not self.group_check_printed: - self.helpers.check_required_groups() + self.helpers.check_required_groups(check_render=True, check_video=True) self.group_check_printed = True if not args.cper: @@ -7174,7 +7173,7 @@ class AMDSMICommands(): # check groups first if not self.group_check_printed: - self.helpers.check_required_groups() + self.helpers.check_required_groups(check_render=True, check_video=False) self.group_check_printed = True processors = amdsmi_interface.amdsmi_get_processor_handles() @@ -7382,6 +7381,11 @@ class AMDSMICommands(): print("No GPUs on machine") return + # Check that KFD permissions are available + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + device = devices[i] listener = amdsmi_interface.AmdSmiEventReader(device, amdsmi_interface.AmdSmiEvtNotificationType) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 4001e440a5..dffb9f743f 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1171,29 +1171,36 @@ class AMDSMIHelpers(): except OSError as e: return False, e.errno, e.strerror - # Check kfd and dri for EACCES/EPERM - def check_required_groups(self): + def check_required_groups(self, check_render=True, check_video=True): """ Check if the current user can access kfd and dri Specifically, only care for EACCES/EPERM + + Args: + check_render (bool): Whether to check /dev/kfd & /dev/dri/renderD* devices. Defaults to True. + check_video (bool): Whether to check /dev/dri/card* devices. Defaults to True. + + Returns: + bool: True if all checked devices are accessible, False if any permission errors found """ # Skip check if running as root. if os.geteuid() == 0: - return + return True paths_to_check = [] - if os.path.exists("/dev/kfd"): + + # Only add paths for device types that are flagged for checking + if check_render and os.path.exists("/dev/kfd"): paths_to_check.append("/dev/kfd") - - # Render group correspond to /dev/dri/renderD* - paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))] + paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))] # Video group corresponds to /dev/dri/card* - paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))] + if check_video: + paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))] if not paths_to_check: - return + return True denied = [] @@ -1206,27 +1213,100 @@ class AMDSMIHelpers(): denied.append((path, err, msg, self._stat_info(path))) if denied: + # Collect unique group info from denied devices + required_groups = {"kfd": [], "renderD": [], "card": []} + device_types = {"kfd": [], "renderD": [], "card": []} + + for path, err, msg, si in denied: + if "error" not in si: + # Categorize devices and collect unique group info + if "/dev/kfd" in path: + device_types["kfd"].append(path) + required_groups["kfd"].append(si) + elif "/dev/dri/renderD" in path: + device_types["renderD"].append(path) + required_groups["renderD"].append(si) + elif "/dev/dri/card" in path: + device_types["card"].append(path) + required_groups["card"].append(si) + + # Deduplicate group info by converting to tuple for hashing + for device_type in required_groups: + unique_groups = list(dict.fromkeys( + tuple(sorted(d.items())) for d in required_groups[device_type] + )) + required_groups[device_type] = [dict(item) for item in unique_groups] + lines = [] lines.append("Permission needed to access required GPU device node(s):") - for path, err, msg, si in denied: - if "error" in si: - lines.append(f" - {path}: {os.strerror(err)}; stat failed: {si['error']}") + + # Collect all unique groups for usermod command + all_groups = set() + + # Show summary of denied devices by type with ownership info + if device_types["kfd"]: + lines.append(" • /dev/kfd: Permission denied") + if len(required_groups["kfd"]) > 1: + lines.append(" - Required group(s):") else: + lines.append(" - Required group:") + for group_info in required_groups["kfd"]: lines.append( - " - {p}: {err}; owner={user}({uid}):{group}({gid});".format( - p=path, - err=os.strerror(err), - user=si["user"], - uid=si["uid"], - group=si["group"], - gid=si["gid"], + " - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format( + user=group_info["user"], + uid=group_info["uid"], + group=group_info["group"], + gid=group_info["gid"], ) ) + all_groups.add(group_info["group"]) - lines.append("") - lines.append("You can try:") - lines.append(" • Add your user to the group that owns these devices:") - lines.append(" sudo usermod -aG \"$USER\"\n") + if device_types["renderD"]: + lines.append(f" • /dev/dri/renderD*: {len(device_types['renderD'])} device(s) denied") + if len(required_groups["renderD"]) > 1: + lines.append(" - Required group(s):") + else: + lines.append(" - Required group:") + for group_info in required_groups["renderD"]: + lines.append( + " - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format( + user=group_info["user"], + uid=group_info["uid"], + group=group_info["group"], + gid=group_info["gid"], + ) + ) + all_groups.add(group_info["group"]) + + if device_types["card"]: + lines.append(f" • /dev/dri/card*: {len(device_types['card'])} device(s) denied") + if len(required_groups["card"]) > 1: + lines.append(" - Required group(s):") + else: + lines.append(" - Required group:") + for group_info in required_groups["card"]: + lines.append( + " - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format( + user=group_info["user"], + uid=group_info["uid"], + group=group_info["group"], + gid=group_info["gid"], + ) + ) + all_groups.add(group_info["group"]) + + # Generate usermod command with all unique groups + groups_for_usermod = ",".join(sorted(all_groups)) + + lines.extend([ + "", + "To resolve this issue, try the following:", + " • Add your user to the required group(s):", + f" sudo usermod -aG {groups_for_usermod} \"$USER\"", + " • Log out and log back in for the group changes to take effect", + " • Alternatively, run this command with sudo/admin privileges", + "" + ]) print("\n".join(lines)) return False