Updated Driver Error Logging & Exceptions
Change-Id: Idd14904b33e82e4cb5d9f84c75978fe686a9b603
Signed-off-by: Maisam Arif <maisarif@amd.com>
[ROCm/amdsmi commit: 3fa96a9e02]
Этот коммит содержится в:
коммит произвёл
Maisam Arif
родитель
cbdb61c9c5
Коммит
4fceaa7c5c
@@ -42,6 +42,12 @@ def _print_error(e, destination):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Disable traceback before possible init errors in AMDSMICommands and AMDSMIParser
|
||||
if "DEBUG" in sys.argv:
|
||||
sys.tracebacklimit = 10
|
||||
else:
|
||||
sys.tracebacklimit = -1
|
||||
|
||||
amd_smi_commands = AMDSMICommands()
|
||||
amd_smi_parser = AMDSMIParser(amd_smi_commands.version,
|
||||
amd_smi_commands.list,
|
||||
@@ -80,7 +86,9 @@ if __name__ == "__main__":
|
||||
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel])
|
||||
|
||||
# Disable traceback for non-debug log levels
|
||||
if args.loglevel != "DEBUG":
|
||||
if args.loglevel == "DEBUG":
|
||||
sys.tracebacklimit = 10
|
||||
else:
|
||||
sys.tracebacklimit = -1
|
||||
|
||||
# Execute subcommands
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
|
||||
@@ -42,7 +43,12 @@ class AMDSMICommands():
|
||||
try:
|
||||
self.device_handles = amdsmi_interface.amdsmi_get_processor_handles()
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
raise e
|
||||
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
|
||||
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
|
||||
logging.error('Unable to get devices, driver not initialized (amdgpu not found in modules)')
|
||||
sys.exit(-1)
|
||||
else:
|
||||
raise e
|
||||
self.stop = ''
|
||||
self.all_arguments = False
|
||||
|
||||
|
||||
@@ -141,8 +141,17 @@ class AMDSMIHelpers():
|
||||
gpu_choices = {}
|
||||
gpu_choices_str = ""
|
||||
|
||||
# amdsmi_get_processor_handles returns the device_handles sorted by gpu_id
|
||||
device_handles = amdsmi_interface.amdsmi_get_processor_handles()
|
||||
try:
|
||||
# amdsmi_get_processor_handles returns the device_handles sorted by gpu_id
|
||||
device_handles = amdsmi_interface.amdsmi_get_processor_handles()
|
||||
except amdsmi_interface.AmdSmiLibraryException as e:
|
||||
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
|
||||
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
|
||||
logging.error('Unable to get device choices, driver not initialized (amdgpu not found in modules)')
|
||||
sys.exit(-1)
|
||||
else:
|
||||
raise e
|
||||
|
||||
for gpu_id, device_handle in enumerate(device_handles):
|
||||
bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(device_handle)
|
||||
uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(device_handle)
|
||||
|
||||
@@ -36,9 +36,9 @@ from amdsmi import amdsmi_interface
|
||||
from amdsmi import amdsmi_exception
|
||||
|
||||
# Using basic python logging for user errors and development
|
||||
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.ERROR) # User level logging
|
||||
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR) # User level logging
|
||||
# This traceback limit only affects this file; once the code hits the CLI portion it gets reset to the user's preference
|
||||
sys.tracebacklimit = -1 # Disable traceback for user errors
|
||||
sys.tracebacklimit = -1 # Disable traceback when raising errors
|
||||
|
||||
# On initial import set initialized variable
|
||||
AMDSMI_INITIALIZED = False
|
||||
@@ -47,11 +47,9 @@ AMD_VENDOR_ID = 4098
|
||||
def check_amdgpu_driver():
|
||||
""" Returns true if amdgpu is found in the list of initialized modules """
|
||||
amd_gpu_status_file = Path("/sys/module/amdgpu/initstate")
|
||||
|
||||
if amd_gpu_status_file.exists():
|
||||
if amd_gpu_status_file.read_text(encoding='ascii').strip() == 'live':
|
||||
if amd_gpu_status_file.read_text(encoding="ascii").strip() == "live":
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
@@ -61,17 +59,22 @@ def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS):
|
||||
Raises:
|
||||
err: AmdSmiLibraryException if not successful
|
||||
"""
|
||||
# Check if amdgpu driver is up & Handle error gracefully
|
||||
# # Check if amdgpu driver is up & Handle error gracefully
|
||||
if check_amdgpu_driver():
|
||||
# Only init AMD GPUs for now, waiting for future support for AMD CPUs
|
||||
try:
|
||||
amdsmi_interface.amdsmi_init(flag)
|
||||
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as err:
|
||||
raise err
|
||||
logging.debug('AMDSMI initialized successfully')
|
||||
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e:
|
||||
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
|
||||
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
|
||||
logging.error("Driver not loaded (amdgpu not found in modules)")
|
||||
sys.exit(-1)
|
||||
else:
|
||||
raise e
|
||||
logging.debug("AMDSMI initialized successfully, but initstate was not live")
|
||||
else:
|
||||
logging.error('Driver not initialized (amdgpu not found in modules)')
|
||||
exit(-1)
|
||||
logging.error("Driver not found (amdgpu not found in modules)")
|
||||
sys.exit(-1)
|
||||
|
||||
|
||||
def shut_down_amdsmi():
|
||||
@@ -82,12 +85,13 @@ def shut_down_amdsmi():
|
||||
"""
|
||||
try:
|
||||
amdsmi_interface.amdsmi_shut_down()
|
||||
except amdsmi_exception.AmdSmiLibraryException as err:
|
||||
raise err
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.error("Unable to cleanly shut down amd-smi-lib")
|
||||
raise e
|
||||
|
||||
|
||||
def signal_handler(sig, frame):
|
||||
logging.debug(f'Handling signal: {sig}')
|
||||
logging.debug(f"Handling signal: {sig}")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@ class AmdSmiLibraryException(AmdSmiException):
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NOT_FOUND : "AMDSMI_STATUS_NOT_FOUND - Device Not found",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT : "AMDSMI_STATUS_NOT_INIT - Device not initialized",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NO_SLOT : "AMDSMI_STATUS_NO_SLOT - No more free slot",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED : "AMDSMI_STATUS_DRIVER_NOT_LOADED - Processor driver not loaded",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED : "AMDSMI_STATUS_DRIVER_NOT_LOADED - Driver not loaded",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NO_DATA : "AMDSMI_STATUS_NO_DATA - No data was found for given input",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_INSUFFICIENT_SIZE : "AMDSMI_STATUS_INSUFFICIENT_SIZE - Insufficient size for operation",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_SIZE : "AMDSMI_STATUS_UNEXPECTED_SIZE - unexpected size of data was read",
|
||||
|
||||
Ссылка в новой задаче
Block a user