diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py index d1232b2b26..3a88ae5acc 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py @@ -42,6 +42,12 @@ def _print_error(e, destination): if __name__ == "__main__": + # Disable traceback before possible init errors in AMDSMICommands and AMDSMIParser + if "DEBUG" in sys.argv: + sys.tracebacklimit = 10 + else: + sys.tracebacklimit = -1 + amd_smi_commands = AMDSMICommands() amd_smi_parser = AMDSMIParser(amd_smi_commands.version, amd_smi_commands.list, @@ -80,7 +86,9 @@ if __name__ == "__main__": logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel]) # Disable traceback for non-debug log levels - if args.loglevel != "DEBUG": + if args.loglevel == "DEBUG": + sys.tracebacklimit = 10 + else: sys.tracebacklimit = -1 # Execute subcommands diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index fa3cfbe14a..a4f2aee64c 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -21,6 +21,7 @@ # import logging +import sys import threading import time @@ -42,7 +43,12 @@ class AMDSMICommands(): try: self.device_handles = amdsmi_interface.amdsmi_get_processor_handles() except amdsmi_exception.AmdSmiLibraryException as e: - raise e + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, + amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): + logging.error('Unable to get devices, driver not initialized (amdgpu not found in modules)') + sys.exit(-1) + else: + raise e self.stop = '' self.all_arguments = False diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 8b88c10c05..f4914e01a4 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -141,8 +141,17 
@@ class AMDSMIHelpers(): gpu_choices = {} gpu_choices_str = "" - # amdsmi_get_processor_handles returns the device_handles storted for gpu_id - device_handles = amdsmi_interface.amdsmi_get_processor_handles() + try: + # amdsmi_get_processor_handles returns the device_handles sorted for gpu_id + device_handles = amdsmi_interface.amdsmi_get_processor_handles() + except amdsmi_interface.AmdSmiLibraryException as e: + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, + amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): + logging.error('Unable to get device choices, driver not initialized (amdgpu not found in modules)') + sys.exit(-1) + else: + raise e + for gpu_id, device_handle in enumerate(device_handles): bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(device_handle) uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(device_handle) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_init.py b/projects/amdsmi/amdsmi_cli/amdsmi_init.py index 7e837026cc..a1cb955d02 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_init.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_init.py @@ -36,9 +36,9 @@ from amdsmi import amdsmi_interface from amdsmi import amdsmi_exception # Using basic python logging for user errors and development -logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.ERROR) # User level logging +logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR) # User level logging # This traceback limit only affects this file, once the code hit's the cli portion it get's reset to the user's preference -sys.tracebacklimit = -1 # Disable traceback for user errors +sys.tracebacklimit = -1 # Disable traceback when raising errors # On initial import set initialized variable AMDSMI_INITIALIZED = False @@ -47,11 +47,9 @@ AMD_VENDOR_ID = 4098 def check_amdgpu_driver(): """ Returns true if amdgpu is found in the list of initialized modules """ amd_gpu_status_file = Path("/sys/module/amdgpu/initstate") - if 
amd_gpu_status_file.exists(): - if amd_gpu_status_file.read_text(encoding='ascii').strip() == 'live': + if amd_gpu_status_file.read_text(encoding="ascii").strip() == "live": return True - return False @@ -61,17 +59,22 @@ def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS): Raises: err: AmdSmiLibraryException if not successful """ - # Check if amdgpu driver is up & Handle error gracefully + # Check if amdgpu driver is up & Handle error gracefully if check_amdgpu_driver(): # Only init AMD GPUs for now, waiting for future support for AMD CPUs try: amdsmi_interface.amdsmi_init(flag) - except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as err: - raise err - logging.debug('AMDSMI initialized successfully') + except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e: + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, + amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): + logging.error("Driver not loaded (amdgpu not found in modules)") + sys.exit(-1) + else: + raise e + logging.debug("AMDSMI initialized successfully, but initstate was not live") else: - logging.error('Driver not initialized (amdgpu not found in modules)') - exit(-1) + logging.error("Driver not found (amdgpu not found in modules)") + sys.exit(-1) def shut_down_amdsmi(): @@ -82,12 +85,13 @@ def shut_down_amdsmi(): """ try: amdsmi_interface.amdsmi_shut_down() - except amdsmi_exception.AmdSmiLibraryException as err: - raise err + except amdsmi_exception.AmdSmiLibraryException as e: + logging.error("Unable to cleanly shut down amd-smi-lib") + raise e def signal_handler(sig, frame): - logging.debug(f'Handling signal: {sig}') + logging.debug(f"Handling signal: {sig}") sys.exit(0) diff --git a/projects/amdsmi/py-interface/amdsmi_exception.py b/projects/amdsmi/py-interface/amdsmi_exception.py index 162e4f7a00..309831101b 100644 --- 
a/projects/amdsmi/py-interface/amdsmi_exception.py +++ b/projects/amdsmi/py-interface/amdsmi_exception.py @@ -72,7 +72,7 @@ class AmdSmiLibraryException(AmdSmiException): amdsmi_wrapper.AMDSMI_STATUS_NOT_FOUND : "AMDSMI_STATUS_NOT_FOUND - Device Not found", amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT : "AMDSMI_STATUS_NOT_INIT - Device not initialized", amdsmi_wrapper.AMDSMI_STATUS_NO_SLOT : "AMDSMI_STATUS_NO_SLOT - No more free slot", - amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED : "AMDSMI_STATUS_DRIVER_NOT_LOADED - Processor driver not loaded", + amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED : "AMDSMI_STATUS_DRIVER_NOT_LOADED - Driver not loaded", amdsmi_wrapper.AMDSMI_STATUS_NO_DATA : "AMDSMI_STATUS_NO_DATA - No data was found for given input", amdsmi_wrapper.AMDSMI_STATUS_INSUFFICIENT_SIZE : "AMDSMI_STATUS_INSUFFICIENT_SIZE - Insufficient size for operation", amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_SIZE : "AMDSMI_STATUS_UNEXPECTED_SIZE - unexpected size of data was read",