Updated Driver Error Logging & Exceptions

Change-Id: Idd14904b33e82e4cb5d9f84c75978fe686a9b603
Signed-off-by: Maisam Arif <maisarif@amd.com>


[ROCm/amdsmi commit: 3fa96a9e02]
Этот коммит содержится в:
Maisam Arif
2023-09-27 02:37:46 -05:00
коммит произвёл Maisam Arif
родитель cbdb61c9c5
Коммит 4fceaa7c5c
5 изменённых файлов: 46 добавлений и 19 удалений
+9 -1
Просмотреть файл
@@ -42,6 +42,12 @@ def _print_error(e, destination):
if __name__ == "__main__":
# Disable traceback before possible init errors in AMDSMICommands and AMDSMIParser
if "DEBUG" in sys.argv:
sys.tracebacklimit = 10
else:
sys.tracebacklimit = -1
amd_smi_commands = AMDSMICommands()
amd_smi_parser = AMDSMIParser(amd_smi_commands.version,
amd_smi_commands.list,
@@ -80,7 +86,9 @@ if __name__ == "__main__":
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel])
# Disable traceback for non-debug log levels
if args.loglevel != "DEBUG":
if args.loglevel == "DEBUG":
sys.tracebacklimit = 10
else:
sys.tracebacklimit = -1
# Execute subcommands
+7 -1
Просмотреть файл
@@ -21,6 +21,7 @@
#
import logging
import sys
import threading
import time
@@ -42,7 +43,12 @@ class AMDSMICommands():
try:
self.device_handles = amdsmi_interface.amdsmi_get_processor_handles()
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
logging.error('Unable to get devices, driver not initialized (amdgpu not found in modules)')
sys.exit(-1)
else:
raise e
self.stop = ''
self.all_arguments = False
+11 -2
Просмотреть файл
@@ -141,8 +141,17 @@ class AMDSMIHelpers():
gpu_choices = {}
gpu_choices_str = ""
# amdsmi_get_processor_handles returns the device_handles sorted for gpu_id
device_handles = amdsmi_interface.amdsmi_get_processor_handles()
try:
# amdsmi_get_processor_handles returns the device_handles sorted for gpu_id
device_handles = amdsmi_interface.amdsmi_get_processor_handles()
except amdsmi_interface.AmdSmiLibraryException as e:
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
logging.error('Unable to get device choices, driver not initialized (amdgpu not found in modules)')
sys.exit(-1)
else:
raise e
for gpu_id, device_handle in enumerate(device_handles):
bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(device_handle)
uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(device_handle)
+18 -14
Просмотреть файл
@@ -36,9 +36,9 @@ from amdsmi import amdsmi_interface
from amdsmi import amdsmi_exception
# Using basic python logging for user errors and development
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.ERROR) # User level logging
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR) # User level logging
# This traceback limit only affects this file; once the code hits the CLI portion it gets reset to the user's preference
sys.tracebacklimit = -1 # Disable traceback for user errors
sys.tracebacklimit = -1 # Disable traceback when raising errors
# On initial import set initialized variable
AMDSMI_INITIALIZED = False
@@ -47,11 +47,9 @@ AMD_VENDOR_ID = 4098
def check_amdgpu_driver():
""" Returns true if amdgpu is found in the list of initialized modules """
amd_gpu_status_file = Path("/sys/module/amdgpu/initstate")
if amd_gpu_status_file.exists():
if amd_gpu_status_file.read_text(encoding='ascii').strip() == 'live':
if amd_gpu_status_file.read_text(encoding="ascii").strip() == "live":
return True
return False
@@ -61,17 +59,22 @@ def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS):
Raises:
err: AmdSmiLibraryException if not successful
"""
# Check if amdgpu driver is up & Handle error gracefully
# Check if amdgpu driver is up & handle error gracefully
if check_amdgpu_driver():
# Only init AMD GPUs for now, waiting for future support for AMD CPUs
try:
amdsmi_interface.amdsmi_init(flag)
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as err:
raise err
logging.debug('AMDSMI initialized successfully')
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e:
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
logging.error("Driver not loaded (amdgpu not found in modules)")
sys.exit(-1)
else:
raise e
logging.debug("AMDSMI initialized successfully, but initstate was not live")
else:
logging.error('Driver not initialized (amdgpu not found in modules)')
exit(-1)
logging.error("Driver not found (amdgpu not found in modules)")
sys.exit(-1)
def shut_down_amdsmi():
@@ -82,12 +85,13 @@ def shut_down_amdsmi():
"""
try:
amdsmi_interface.amdsmi_shut_down()
except amdsmi_exception.AmdSmiLibraryException as err:
raise err
except amdsmi_exception.AmdSmiLibraryException as e:
logging.error("Unable to cleanly shut down amd-smi-lib")
raise e
def signal_handler(sig, frame):
logging.debug(f'Handling signal: {sig}')
logging.debug(f"Handling signal: {sig}")
sys.exit(0)
+1 -1
Просмотреть файл
@@ -72,7 +72,7 @@ class AmdSmiLibraryException(AmdSmiException):
amdsmi_wrapper.AMDSMI_STATUS_NOT_FOUND : "AMDSMI_STATUS_NOT_FOUND - Device Not found",
amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT : "AMDSMI_STATUS_NOT_INIT - Device not initialized",
amdsmi_wrapper.AMDSMI_STATUS_NO_SLOT : "AMDSMI_STATUS_NO_SLOT - No more free slot",
amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED : "AMDSMI_STATUS_DRIVER_NOT_LOADED - Processor driver not loaded",
amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED : "AMDSMI_STATUS_DRIVER_NOT_LOADED - Driver not loaded",
amdsmi_wrapper.AMDSMI_STATUS_NO_DATA : "AMDSMI_STATUS_NO_DATA - No data was found for given input",
amdsmi_wrapper.AMDSMI_STATUS_INSUFFICIENT_SIZE : "AMDSMI_STATUS_INSUFFICIENT_SIZE - Insufficient size for operation",
amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_SIZE : "AMDSMI_STATUS_UNEXPECTED_SIZE - unexpected size of data was read",