From 4518e7ee91687b3dcb1ba0d87ed31c364dfb6974 Mon Sep 17 00:00:00 2001 From: "Pryor, Adam" Date: Tue, 7 Oct 2025 22:24:56 -0500 Subject: [PATCH] [SWDEV-556149] Fix group checking (#746) Signed-off-by: Arif, Maisam Signed-off-by: Pryor, Adam --- amdsmi_cli/amdsmi_commands.py | 27 ++------ amdsmi_cli/amdsmi_helpers.py | 112 +++++++++++++++++++++++++++------- 2 files changed, 96 insertions(+), 43 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 6913472ac3..b13d935401 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -217,19 +217,7 @@ class AMDSMICommands(): if args.gpu == None: args.gpu = self.device_handles - # Perform one-time group check. If it fails, record that fact - # but do NOT abort—just mark that UUID should be "N/A" later. - _group_check_done = False - _group_in_groups = False - if not _group_check_done: - try: - self.helpers.check_required_groups() - _group_in_groups = True - except Exception as e: - _group_in_groups = False - # print the helper's error message exactly once: - print(f"{e}") - _group_check_done = True + _group_in_groups = self.helpers.check_required_groups() # Handle multiple GPUs handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list) @@ -247,14 +235,9 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: bdf = "N/A" - # Only fetch UUID if group check passed; otherwise force "N/A" - if _group_in_groups: - try: - uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu) - except amdsmi_exception.AmdSmiLibraryException: - uuid = "N/A" - else: - # user not in render/video → UUID is N/A + try: + uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu) + except amdsmi_exception.AmdSmiLibraryException: uuid = "N/A" try: @@ -6447,7 +6430,7 @@ class AMDSMICommands(): gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu) xgmi_values.append({"gpu" : gpu_id, "bdf" : gpu_bdf}) - # Populate header with just bdfs + # Populate header with just it's gpu_id self.logger.table_header += f"GPU{gpu_id}".rjust(13) # Cache processor handles for each BDF diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index 08170c6603..4001e440a5 100755 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -29,6 +29,9 @@ import platform import re import sys import time +import glob +import errno +import pwd from enum import Enum from pathlib import Path @@ -115,7 +118,7 @@ class AMDSMIHelpers(): This is used to determine if the last set was successful or not. """ self._previous_set_success_check = status - + def get_previous_set_success_check(self): """Returns the previous set success check. This is used to determine if the last set was successful or not. @@ -822,7 +825,7 @@ class AMDSMIHelpers(): except amdsmi_interface.AmdSmiLibraryException as e: logging.debug(f"AMDSMIHelpers.get_power_caps - Unable to get power cap info for device {dev}: {str(e)}") continue - + # If we never found a real min or max, set them to N/A if power_cap_min == amdsmi_interface.MaxUIntegerTypes.UINT64_T: power_cap_min = "N/A" @@ -931,7 +934,7 @@ class AMDSMIHelpers(): memory (NPS) partition mode. Please use `sudo amd-smi reset -r` AFTER successfully - changing the memory (NPS) partition mode. A successful driver reload + changing the memory (NPS) partition mode. A successful driver reload is REQUIRED in order to complete updating ALL GPUs in the hive to the requested partition mode. @@ -1133,34 +1136,101 @@ class AMDSMIHelpers(): for i in self.progressbar(range(timeInSeconds), title, 40, add_newline=add_newline): time.sleep(1) + def _user_name(self, uid: int) -> str: + try: + return pwd.getpwuid(uid).pw_name + except Exception: + # In containers, the UID may not resolve to a name + return str(uid) + def _group_name(self, gid: int) -> str: + try: + return grp.getgrgid(gid).gr_name + except Exception: + # In containers, the GID may not resolve to a name + return str(gid) + + # Attempt to grab file info + def _stat_info(self, path: str) -> dict: + try: + st = os.stat(path) + return { + "uid": st.st_uid, + "gid": st.st_gid, + "user": self._user_name(st.st_uid), + "group": self._group_name(st.st_gid), + } + except Exception as e: + return {"error": str(e)} + + def _try_open(self, path: str): + try: + fd = os.open(path, os.O_RDONLY) # Only read access is needed for permission check + os.close(fd) + return True, None, None + except OSError as e: + return False, e.errno, e.strerror + + # Check kfd and dri for EACCES/EPERM def check_required_groups(self): """ - Check if the current user is a member of the required groups. - If not, log a warning. + Check if the current user can access kfd and dri + Specifically, only care for EACCES/EPERM """ # Skip check if running as root. if os.geteuid() == 0: return - required_groups = {'video', 'render'} + paths_to_check = [] + if os.path.exists("/dev/kfd"): + paths_to_check.append("/dev/kfd") - user_groups = set() - for gid in set(os.getgroups()) | {os.getgid()}: - try: - user_groups.add(grp.getgrgid(gid).gr_name) - except Exception as e: - # Expected in containers when the name for this GID isn't defined - pass + # Render group correspond to /dev/dri/renderD* + paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))] - missing_groups = required_groups - user_groups - if missing_groups: - msg = ( - "WARNING: User is missing the following required groups: %s. " - "Please add user to these groups." - ) % ", ".join(sorted(missing_groups)) - raise RuntimeError(msg) + # Video group corresponds to /dev/dri/card* + paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))] + + if not paths_to_check: + return + + denied = [] + + for path in paths_to_check: + ok, err, msg = self._try_open(path) + if ok: + continue + # if permission denied or operation not permitted + if err in (errno.EACCES, errno.EPERM): + denied.append((path, err, msg, self._stat_info(path))) + + if denied: + lines = [] + lines.append("Permission needed to access required GPU device node(s):") + for path, err, msg, si in denied: + if "error" in si: + lines.append(f" - {path}: {os.strerror(err)}; stat failed: {si['error']}") + else: + lines.append( + " - {p}: {err}; owner={user}({uid}):{group}({gid});".format( + p=path, + err=os.strerror(err), + user=si["user"], + uid=si["uid"], + group=si["group"], + gid=si["gid"], + ) + ) + + lines.append("") + lines.append("You can try:") + lines.append(" • Add your user to the group that owns these devices:") + lines.append(" sudo usermod -aG \"$USER\"\n") + print("\n".join(lines)) + return False + + return True def _severity_as_string(self, error_severity, notify_type, for_filename): if error_severity == "non_fatal_uncorrected": @@ -1563,7 +1633,7 @@ class AMDSMIHelpers(): if not isinstance(data, (list, tuple)): logging.debug(f"Invalid data type for {context}: expected list/tuple, got {type(data)}") return "N/A" - + # Flatten nested lists and filter integers flat = [v for value in data for v in (value if isinstance(value, list) else [value]) if isinstance(v, int)] return round(sum(flat) / len(flat)) if flat else "N/A"