Fixed log handling and exceptions
Updated exceptions. Added driver load exception. Fixed logging override by removing previous log handlers. Updated debug output to use gpu_id instead of the C pointer. Removed AmdSmiRetcode class in favor of using the wrapper directly. Added traceback limits for clean errors (not in debug). Change-Id: Ia02bb842b8f60d9ab4b68b7f8b1afda30b1c021c Signed-off-by: Maisam Arif <maisarif@amd.com>
This commit is contained in:
@@ -102,7 +102,7 @@ Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default).
|
||||
```
|
||||
|
||||
``` bash
|
||||
@@ -123,7 +123,7 @@ Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default).
|
||||
```
|
||||
|
||||
```bash
|
||||
@@ -153,7 +153,7 @@ Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default).
|
||||
```
|
||||
|
||||
```bash
|
||||
@@ -176,7 +176,7 @@ Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default).
|
||||
```
|
||||
|
||||
```bash
|
||||
@@ -215,7 +215,7 @@ Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default).
|
||||
```
|
||||
|
||||
```bash
|
||||
@@ -245,7 +245,7 @@ Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default).
|
||||
```
|
||||
|
||||
```bash
|
||||
@@ -271,7 +271,7 @@ Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default).
|
||||
```
|
||||
|
||||
```bash
|
||||
@@ -296,7 +296,7 @@ Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default).
|
||||
```
|
||||
|
||||
```bash
|
||||
@@ -323,7 +323,7 @@ Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
|
||||
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default).
|
||||
```
|
||||
|
||||
## Disclaimer
|
||||
|
||||
@@ -66,14 +66,22 @@ if __name__ == "__main__":
|
||||
amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.csv.value
|
||||
if args.file:
|
||||
amd_smi_commands.logger.destination = args.file
|
||||
if args.loglevel:
|
||||
logging_dict = {'DEBUG' : logging.DEBUG,
|
||||
'INFO' : logging.INFO,
|
||||
'WARNING': logging.WARNING,
|
||||
'ERROR': logging.ERROR,
|
||||
'CRITICAL': logging.CRITICAL}
|
||||
# Enable debug logs on amdsmi library ie. RSMI_LOGGING = 1 in environment or otherwise
|
||||
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel])
|
||||
|
||||
# Remove previous log handlers
|
||||
for handler in logging.root.handlers[:]:
|
||||
logging.root.removeHandler(handler)
|
||||
|
||||
logging_dict = {'DEBUG' : logging.DEBUG,
|
||||
'INFO' : logging.INFO,
|
||||
'WARNING': logging.WARNING,
|
||||
'ERROR': logging.ERROR,
|
||||
'CRITICAL': logging.CRITICAL}
|
||||
# To enable debug logs on rocm-smi library set RSMI_LOGGING = 1 in environment
|
||||
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel])
|
||||
|
||||
# Disable traceback for non-debug log levels
|
||||
if args.loglevel != "DEBUG":
|
||||
sys.tracebacklimit = -1
|
||||
|
||||
# Execute subcommands
|
||||
args.func(args)
|
||||
|
||||
+114
-75
@@ -191,6 +191,9 @@ class AMDSMICommands():
|
||||
|
||||
static_dict = {}
|
||||
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
if args.asic:
|
||||
try:
|
||||
asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu)
|
||||
@@ -203,7 +206,7 @@ class AMDSMICommands():
|
||||
static_dict['asic'] = asic_info
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
static_dict['asic'] = "N/A"
|
||||
logging.debug("Failed to get asic info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if args.bus:
|
||||
bus_output_info = {}
|
||||
|
||||
@@ -239,13 +242,13 @@ class AMDSMICommands():
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
bus_info = "N/A"
|
||||
logging.debug("Failed to get bus info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
bus_output_info['bdf'] = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
bus_output_info['bdf'] = "N/A"
|
||||
logging.debug("Failed to get bdf for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
bus_output_info.update(bus_info)
|
||||
static_dict['bus'] = bus_output_info
|
||||
@@ -255,7 +258,7 @@ class AMDSMICommands():
|
||||
static_dict['vbios'] = vbios_info
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
static_dict['vbios'] = "N/A"
|
||||
logging.debug("Failed to get vbios info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get vbios info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if self.helpers.is_linux() and self.helpers.is_baremetal():
|
||||
if args.board:
|
||||
@@ -271,7 +274,7 @@ class AMDSMICommands():
|
||||
static_dict['board'] = board_info
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
static_dict['board'] = "N/A"
|
||||
logging.debug("Failed to get board info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get board info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if args.limit:
|
||||
# Power limits
|
||||
try:
|
||||
@@ -283,7 +286,7 @@ class AMDSMICommands():
|
||||
power_limit_error = True
|
||||
max_power_limit = "N/A"
|
||||
current_power_limit = "N/A"
|
||||
logging.debug("Failed to get power cap info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# Edge temperature limits
|
||||
try:
|
||||
@@ -293,7 +296,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
slowdown_temp_edge_limit_error = True
|
||||
slowdown_temp_edge_limit = "N/A"
|
||||
logging.debug("Failed to get edge temperature slowdown metric for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get edge temperature slowdown metric for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if slowdown_temp_edge_limit == 0:
|
||||
slowdown_temp_edge_limit_error = True
|
||||
@@ -306,7 +309,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
shutdown_temp_edge_limit_error = True
|
||||
shutdown_temp_edge_limit = "N/A"
|
||||
logging.debug("Failed to get edge temperature shutdown metrics for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get edge temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if shutdown_temp_edge_limit == 0:
|
||||
shutdown_temp_edge_limit_error = True
|
||||
@@ -320,7 +323,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
slowdown_temp_hotspot_limit_error = True
|
||||
slowdown_temp_hotspot_limit = "N/A"
|
||||
logging.debug("Failed to get hotspot temperature slowdown metrics for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get hotspot temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
shutdown_temp_hotspot_limit_error = False
|
||||
@@ -329,7 +332,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
shutdown_temp_hotspot_limit_error = True
|
||||
shutdown_temp_hotspot_limit = "N/A"
|
||||
logging.debug("Failed to get hotspot temperature shutdown metrics for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get hotspot temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
|
||||
# VRAM temperature limits
|
||||
@@ -340,7 +343,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
slowdown_temp_vram_limit_error = True
|
||||
slowdown_temp_vram_limit = "N/A"
|
||||
logging.debug("Failed to get vram temperature slowdown metrics for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get vram temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
shutdown_temp_vram_limit_error = False
|
||||
@@ -349,7 +352,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
shutdown_temp_vram_limit_error = True
|
||||
shutdown_temp_vram_limit = "N/A"
|
||||
logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
unit = 'W'
|
||||
@@ -392,7 +395,7 @@ class AMDSMICommands():
|
||||
static_dict['driver'] = driver_info
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
static_dict['driver'] = "N/A"
|
||||
logging.debug("Failed to get driver info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get driver info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if self.helpers.is_hypervisor() or self.helpers.is_baremetal():
|
||||
if args.ras:
|
||||
@@ -400,7 +403,7 @@ class AMDSMICommands():
|
||||
static_dict['ras'] = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
static_dict['ras'] = "N/A"
|
||||
logging.debug("Failed to get ras block features for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if args.vram:
|
||||
try:
|
||||
vram_info = amdsmi_interface.amdsmi_get_gpu_vram_info(args.gpu)
|
||||
@@ -417,7 +420,7 @@ class AMDSMICommands():
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
vram_info = "N/A"
|
||||
logging.debug("Failed to get vram info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get vram info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
static_dict['vram'] = vram_info
|
||||
|
||||
@@ -427,13 +430,13 @@ class AMDSMICommands():
|
||||
numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
numa_node_number = "N/A"
|
||||
logging.debug("Failed to get numa node number for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get numa node number for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
numa_affinity = amdsmi_interface.amdsmi_get_gpu_topo_numa_affinity(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
numa_affinity = "N/A"
|
||||
logging.debug("Failed to get numa affinity for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
static_dict['numa'] = {'node' : numa_node_number,
|
||||
'affinity' : numa_affinity}
|
||||
@@ -500,6 +503,10 @@ class AMDSMICommands():
|
||||
args.gpu = device_handle
|
||||
|
||||
fw_list = {}
|
||||
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
if args.fw_list:
|
||||
try:
|
||||
fw_info = amdsmi_interface.amdsmi_get_fw_info(args.gpu)
|
||||
@@ -519,7 +526,7 @@ class AMDSMICommands():
|
||||
fw_list.update(fw_info)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
fw_list['fw_list'] = "N/A"
|
||||
logging.debug("Failed to get firmware info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get firmware info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
multiple_devices_csv_override = False
|
||||
# Convert and store output by pid for csv format
|
||||
@@ -586,13 +593,16 @@ class AMDSMICommands():
|
||||
values_dict = {}
|
||||
bad_page_err_output = ''
|
||||
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
try:
|
||||
bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu)
|
||||
bad_page_error = False
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
bad_page_error = True
|
||||
bad_page_err_output = "N/A"
|
||||
logging.debug("Failed to get bad page info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if bad_page_info == "No bad pages found.":
|
||||
bad_page_error = True
|
||||
@@ -798,6 +808,10 @@ class AMDSMICommands():
|
||||
|
||||
# Add timestamp and store values for specified arguments
|
||||
values_dict = {}
|
||||
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
if self.helpers.is_linux() and self.helpers.is_baremetal():
|
||||
if args.usage:
|
||||
try:
|
||||
@@ -818,7 +832,7 @@ class AMDSMICommands():
|
||||
values_dict['usage'] = engine_usage
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['usage'] = "N/A"
|
||||
logging.debug("Failed to get gpu activity for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get gpu activity for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if args.power:
|
||||
power_dict = {'current_power': "N/A",
|
||||
'current_gfx_voltage': "N/A",
|
||||
@@ -845,7 +859,7 @@ class AMDSMICommands():
|
||||
power_dict['power_limit'] = power_info['power_limit']
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get power info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get power info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
is_power_management_enabled = amdsmi_interface.amdsmi_is_gpu_power_management_enabled(args.gpu)
|
||||
@@ -854,7 +868,7 @@ class AMDSMICommands():
|
||||
else:
|
||||
power_dict['power_management'] = "DISABLED"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get power management status for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
values_dict['power'] = power_dict
|
||||
if args.clock:
|
||||
@@ -870,20 +884,20 @@ class AMDSMICommands():
|
||||
|
||||
clocks['gfx'] = gfx_clock
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get gfx clock info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
# is_clk_locked = amdsmi_interface.amdsmi_is_clk_locked(args.gpu, amdsmi_interface.AmdSmiClkType.GFX)
|
||||
is_clk_locked = "N/A"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
is_clk_locked = "N/A"
|
||||
logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
|
||||
logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if isinstance(clocks['gfx'], dict):
|
||||
clocks['gfx']['is_clk_locked'] = is_clk_locked
|
||||
else:
|
||||
clocks['gfx'] = {'is_clk_locked': is_clk_locked}
|
||||
|
||||
|
||||
try:
|
||||
mem_clock = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_interface.AmdSmiClkType.MEM)
|
||||
|
||||
@@ -894,7 +908,7 @@ class AMDSMICommands():
|
||||
|
||||
clocks['mem'] = mem_clock
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get mem clock info for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
values_dict['clock'] = clocks
|
||||
if args.temperature:
|
||||
@@ -903,14 +917,14 @@ class AMDSMICommands():
|
||||
args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
temperature_edge_current = "N/A"
|
||||
logging.debug("Failed to get current edge temperature for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get current edge temperature for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
temperature_edge_limit = amdsmi_interface.amdsmi_get_temp_metric(
|
||||
args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
temperature_edge_limit = "N/A"
|
||||
logging.debug("Failed to get edge temperature limit for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get edge temperature limit for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# If edge limit is reporting 0 then set the current edge temp to N/A
|
||||
if temperature_edge_limit == 0:
|
||||
@@ -921,14 +935,14 @@ class AMDSMICommands():
|
||||
args.gpu, amdsmi_interface.AmdSmiTemperatureType.HOTSPOT, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
temperature_hotspot_current = "N/A"
|
||||
logging.debug("Failed to get current hotspot temperature for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get current hotspot temperature for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
temperature_vram_current = amdsmi_interface.amdsmi_get_temp_metric(
|
||||
args.gpu, amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
temperature_vram_current = "N/A"
|
||||
logging.debug("Failed to get current vram temperature for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get current vram temperature for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
temperatures = {'edge': temperature_edge_current,
|
||||
'hotspot': temperature_hotspot_current,
|
||||
@@ -950,7 +964,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
ecc_count['correctable'] = "N/A"
|
||||
ecc_count['uncorrectable'] = "N/A"
|
||||
logging.debug("Failed to get ecc count for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get ecc count for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
values_dict['ecc'] = ecc_count
|
||||
if args.ecc_block:
|
||||
@@ -966,14 +980,14 @@ class AMDSMICommands():
|
||||
'uncorrectable': ecc_count['uncorrectable_count']}
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
ecc_count = "N/A"
|
||||
logging.debug("Failed to get ecc count for gpu %s at block %s | %s", args.gpu, gpu_block, e.get_error_info())
|
||||
logging.debug("Failed to get ecc count for gpu %s at block %s | %s", gpu_id, gpu_block, e.get_error_info())
|
||||
|
||||
ecc_dict[state['block']] = {'correctable' : ecc_count,
|
||||
'uncorrectable': ecc_count}
|
||||
values_dict['ecc_block'] = ecc_dict
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['ecc_block'] = "N/A"
|
||||
logging.debug("Failed to get ecc block features for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get ecc block features for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if args.pcie:
|
||||
pcie_dict = {'current_width': "N/A",
|
||||
'current_speed': "N/A",
|
||||
@@ -997,13 +1011,13 @@ class AMDSMICommands():
|
||||
unit = 'GT/s'
|
||||
pcie_link_status['current_speed'] = f"{pcie_link_status['pcie_speed']} {unit}"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pcie link status for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
|
||||
pcie_dict['replay_count'] = pci_replay_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pci replay counter for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
|
||||
@@ -1024,7 +1038,7 @@ class AMDSMICommands():
|
||||
pcie_dict['current_bandwith_received'] = received
|
||||
pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pcie bandwidth for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
values_dict['pcie'] = pcie_dict
|
||||
if args.fan:
|
||||
@@ -1077,7 +1091,7 @@ class AMDSMICommands():
|
||||
values_dict['voltage_curve'] = voltage_point_dict
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['voltage_curve'] = "N/A"
|
||||
logging.debug("Failed to get voltage curve for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get voltage curve for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if args.overdrive:
|
||||
try:
|
||||
overdrive_level = amdsmi_interface.amdsmi_get_gpu_overdrive_level(args.gpu)
|
||||
@@ -1089,14 +1103,14 @@ class AMDSMICommands():
|
||||
values_dict['overdrive'] = overdrive_level
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['overdrive'] = "N/A"
|
||||
logging.debug("Failed to get overdrive level for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get overdrive level for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if args.perf_level:
|
||||
try:
|
||||
perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu)
|
||||
values_dict['perf_level'] = perf_level
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['perf_level'] = "N/A"
|
||||
logging.debug("Failed to get perf level for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get perf level for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if self.helpers.is_linux() and self.helpers.is_baremetal():
|
||||
if args.xgmi_err:
|
||||
@@ -1104,7 +1118,7 @@ class AMDSMICommands():
|
||||
values_dict['xgmi_err'] = amdsmi_interface.amdsmi_gpu_xgmi_error_status(args.gpu)
|
||||
except amdsmi_interface.AmdSmiLibraryException as e:
|
||||
values_dict['xgmi_err'] = "N/A"
|
||||
logging.debug("Failed to get xgmi error status for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if args.energy:
|
||||
try:
|
||||
energy_dict = amdsmi_interface.amdsmi_get_energy_count(args.gpu)
|
||||
@@ -1140,19 +1154,19 @@ class AMDSMICommands():
|
||||
total_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM)
|
||||
memory_usage['total_vram'] = total_vram // (1024*1024)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get total VRAM memory for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get total VRAM memory for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
total_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM)
|
||||
memory_usage['total_visible_vram'] = total_visible_vram // (1024*1024)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get total VIS VRAM memory for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get total VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
total_gtt = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT)
|
||||
memory_usage['total_gtt'] = total_gtt // (1024*1024)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get total GTT memory for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get total GTT memory for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# Used VRAM
|
||||
try:
|
||||
@@ -1160,19 +1174,19 @@ class AMDSMICommands():
|
||||
memory_usage['used_vram'] = used_vram // (1024*1024)
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get used VRAM memory for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get used VRAM memory for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
used_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM)
|
||||
memory_usage['used_visible_vram'] = used_visible_vram // (1024*1024)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get used VIS VRAM memory for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get used VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
used_gtt = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT)
|
||||
memory_usage['used_gtt'] = used_gtt // (1024*1024)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get used GTT memory for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get used GTT memory for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# Free VRAM
|
||||
if memory_usage['total_vram'] != "N/A" and memory_usage['used_vram'] != "N/A":
|
||||
@@ -1288,11 +1302,14 @@ class AMDSMICommands():
|
||||
else:
|
||||
raise IndexError("args.gpu should not be an empty list")
|
||||
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
# Populate initial processes
|
||||
try:
|
||||
process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get process list for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
raise e
|
||||
|
||||
filtered_process_values = []
|
||||
@@ -1301,7 +1318,7 @@ class AMDSMICommands():
|
||||
process_info = amdsmi_interface.amdsmi_get_gpu_process_info(args.gpu, process_handle)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
process_info = "N/A"
|
||||
logging.debug("Failed to get process info for gpu %s on process_handle %s | %s", args.gpu, process_handle, e.get_error_info())
|
||||
logging.debug("Failed to get process info for gpu %s on process_handle %s | %s", gpu_id, process_handle, e.get_error_info())
|
||||
filtered_process_values.append({'process_info': process_info})
|
||||
continue
|
||||
|
||||
@@ -1467,7 +1484,10 @@ class AMDSMICommands():
|
||||
src_gpu_links[dest_gpu_key] = bool(dest_gpu_link_status)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_links[dest_gpu_key] = "N/A"
|
||||
logging.debug("Failed to get link status for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info())
|
||||
logging.debug("Failed to get link status for %s to %s | %s",
|
||||
self.helpers.get_gpu_id_from_device_handle(src_gpu),
|
||||
self.helpers.get_gpu_id_from_device_handle(dest_gpu),
|
||||
e.get_error_info())
|
||||
|
||||
topo_values[src_gpu_index]['link_accessibility'] = src_gpu_links
|
||||
|
||||
@@ -1487,7 +1507,10 @@ class AMDSMICommands():
|
||||
src_gpu_weight[dest_gpu_key] = dest_gpu_link_weight
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_weight[dest_gpu_key] = "N/A"
|
||||
logging.debug("Failed to get link weight for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info())
|
||||
logging.debug("Failed to get link weight for %s to %s | %s",
|
||||
self.helpers.get_gpu_id_from_device_handle(src_gpu),
|
||||
self.helpers.get_gpu_id_from_device_handle(dest_gpu),
|
||||
e.get_error_info())
|
||||
|
||||
topo_values[src_gpu_index]['weight'] = src_gpu_weight
|
||||
|
||||
@@ -1507,7 +1530,10 @@ class AMDSMICommands():
|
||||
src_gpu_hops[dest_gpu_key] = dest_gpu_hops
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_hops[dest_gpu_key] = "N/A"
|
||||
logging.debug("Failed to get link hops for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info())
|
||||
logging.debug("Failed to get link hops for %s to %s | %s",
|
||||
self.helpers.get_gpu_id_from_device_handle(src_gpu),
|
||||
self.helpers.get_gpu_id_from_device_handle(dest_gpu),
|
||||
e.get_error_info())
|
||||
|
||||
topo_values[src_gpu_index]['hops'] = src_gpu_hops
|
||||
|
||||
@@ -1532,7 +1558,10 @@ class AMDSMICommands():
|
||||
src_gpu_link_type[dest_gpu_key] = "XGMI"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_link_type[dest_gpu_key] = "N/A"
|
||||
logging.debug("Failed to get link type for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info())
|
||||
logging.debug("Failed to get link type for %s to %s | %s",
|
||||
self.helpers.get_gpu_id_from_device_handle(src_gpu),
|
||||
self.helpers.get_gpu_id_from_device_handle(dest_gpu),
|
||||
e.get_error_info())
|
||||
|
||||
topo_values[src_gpu_index]['link_type'] = src_gpu_link_type
|
||||
|
||||
@@ -1556,7 +1585,10 @@ class AMDSMICommands():
|
||||
continue
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_link_type[dest_gpu_key] = "N/A"
|
||||
logging.debug("Failed to get link type for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info())
|
||||
logging.debug("Failed to get link type for %s to %s | %s",
|
||||
self.helpers.get_gpu_id_from_device_handle(src_gpu),
|
||||
self.helpers.get_gpu_id_from_device_handle(dest_gpu),
|
||||
e.get_error_info())
|
||||
|
||||
try:
|
||||
min_bw = amdsmi_interface.amdsmi_get_minmax_bandwidth_between_processors(src_gpu, dest_gpu)['min_bandwidth']
|
||||
@@ -1565,6 +1597,10 @@ class AMDSMICommands():
|
||||
src_gpu_link_type[dest_gpu_key] = f'{min_bw}-{max_bw}'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_link_type[dest_gpu_key] = e.get_error_info()
|
||||
logging.debug("Failed to get min max bandwidth for %s to %s | %s",
|
||||
self.helpers.get_gpu_id_from_device_handle(src_gpu),
|
||||
self.helpers.get_gpu_id_from_device_handle(dest_gpu),
|
||||
e.get_error_info())
|
||||
|
||||
topo_values[src_gpu_index]['numa_bandwidth'] = src_gpu_link_type
|
||||
|
||||
@@ -1638,7 +1674,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
amdsmi_interface.amdsmi_set_gpu_fan_speed(args.gpu, 0, args.fan)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
raise ValueError(f"Unable to set fan speed {args.fan} on {gpu_string}") from e
|
||||
|
||||
@@ -1648,7 +1684,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, perf_level)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
raise ValueError(f"Unable to set performance level {args.perflevel} on {gpu_string}") from e
|
||||
|
||||
@@ -1659,7 +1695,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
amdsmi_interface.amdsmi_set_gpu_perf_determinism_mode(args.gpu, args.perfdeterminism)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
raise ValueError(f"Unable to set performance determinism and clock frequency to {args.perfdeterminism} on {gpu_string}") from e
|
||||
|
||||
@@ -1721,13 +1757,16 @@ class AMDSMICommands():
|
||||
|
||||
args.gpu = device_handle
|
||||
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
if args.gpureset:
|
||||
if self.helpers.is_amd_device(args.gpu):
|
||||
try:
|
||||
amdsmi_interface.amdsmi_reset_gpu(args.gpu)
|
||||
result = 'Successfully reset GPU'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
result = "Failed to reset GPU"
|
||||
else:
|
||||
@@ -1742,30 +1781,30 @@ class AMDSMICommands():
|
||||
amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, 0)
|
||||
reset_clocks_results['overdrive'] = 'Overdrive set to 0'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
reset_clocks_results['overdrive'] = "N/A"
|
||||
logging.debug("Failed to reset overdrive on gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to reset overdrive on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO
|
||||
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto)
|
||||
reset_clocks_results['clocks'] = 'Successfully reset clocks'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
reset_clocks_results['clocks'] = "N/A"
|
||||
logging.debug("Failed to reset perf level on gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO
|
||||
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto)
|
||||
reset_clocks_results['performance'] = 'Performance level reset to auto'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
reset_clocks_results['performance'] = "N/A"
|
||||
logging.debug("Failed to reset perf level on gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
self.logger.store_output(args.gpu, 'reset_clocks', reset_clocks_results)
|
||||
if args.fans:
|
||||
@@ -1773,10 +1812,10 @@ class AMDSMICommands():
|
||||
amdsmi_interface.amdsmi_reset_gpu_fan(args.gpu, 0)
|
||||
result = 'Successfully reset fan speed to driver control'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
result = "N/A"
|
||||
logging.debug("Failed to reset fans on gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to reset fans on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
self.logger.store_output(args.gpu, 'reset_fans', result)
|
||||
if args.profile:
|
||||
@@ -1787,20 +1826,20 @@ class AMDSMICommands():
|
||||
amdsmi_interface.amdsmi_set_gpu_power_profile(args.gpu, 0, power_profile_mask)
|
||||
reset_profile_results['power_profile'] = 'Successfully reset Power Profile'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
reset_profile_results['power_profile'] = "N/A"
|
||||
logging.debug("Failed to reset power profile on gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to reset power profile on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO
|
||||
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto)
|
||||
reset_profile_results['performance_level'] = 'Successfully reset Performance Level'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
reset_profile_results['performance_level'] = "N/A"
|
||||
logging.debug("Failed to reset perf level on gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
self.logger.store_output(args.gpu, 'reset_profile', reset_profile_results)
|
||||
if args.xgmierr:
|
||||
@@ -1808,10 +1847,10 @@ class AMDSMICommands():
|
||||
amdsmi_interface.amdsmi_reset_gpu_xgmi_error(args.gpu)
|
||||
result = 'Successfully reset XGMI Error count'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
result = "N/A"
|
||||
logging.debug("Failed to reset xgmi error count on gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to reset xgmi error count on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
self.logger.store_output(args.gpu, 'reset_xgmi_err', result)
|
||||
if args.perfdeterminism:
|
||||
try:
|
||||
@@ -1819,10 +1858,10 @@ class AMDSMICommands():
|
||||
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto)
|
||||
result = 'Successfully disabled performance determinism'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
result = "N/A"
|
||||
logging.debug("Failed to set perf level on gpu %s | %s", args.gpu, e.get_error_info())
|
||||
logging.debug("Failed to set perf level on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
self.logger.store_output(args.gpu, 'reset_perf_determinism', result)
|
||||
|
||||
@@ -1857,7 +1896,7 @@ class AMDSMICommands():
|
||||
commands.logger.store_output(device, 'values', values_dict)
|
||||
commands.logger.print_output()
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.err_code != amdsmi_exception.AmdSmiRetCode.STATUS_NO_DATA:
|
||||
if e.err_code != amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_DATA:
|
||||
print(e)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
@@ -37,6 +37,8 @@ from amdsmi import amdsmi_exception
|
||||
|
||||
# Using basic python logging for user errors and development
|
||||
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.ERROR) # User level logging
|
||||
# This traceback limit only affects this file; once the code hits the CLI portion it gets reset to the user's preference
|
||||
sys.tracebacklimit = -1 # Disable traceback for user errors
|
||||
|
||||
# On initial import set initialized variable
|
||||
AMDSMI_INITIALIZED = False
|
||||
@@ -66,8 +68,7 @@ def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS):
|
||||
amdsmi_interface.amdsmi_init(flag)
|
||||
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as err:
|
||||
raise err
|
||||
|
||||
logging.info('AMDSMI initialized successfully') # without errors really
|
||||
logging.debug('AMDSMI initialized successfully')
|
||||
else:
|
||||
logging.error('Driver not initialized (amdgpu not found in modules)')
|
||||
exit(-1)
|
||||
|
||||
@@ -209,7 +209,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
json_help = "Displays output in JSON format (human readable by default)."
|
||||
csv_help = "Displays output in CSV format (human readable by default)."
|
||||
file_help = "Saves output into a file on the provided path (stdout by default)."
|
||||
loglevel_help = "Set the logging level for the parser commands"
|
||||
loglevel_help = "Set the logging level for the parser commands (ERROR by default)."
|
||||
|
||||
command_modifier_group = subcommand_parser.add_argument_group('Command Modifiers')
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ try:
|
||||
print("No GPUs on machine")
|
||||
except AmdSmiException as e:
|
||||
print("Error code: {}".format(e.err_code))
|
||||
if e.err_code == AmdSmiRetCode.STATUS_RETRY:
|
||||
if e.err_code == amdsmi_wrapper.AMDSMI_STATUS_RETRY:
|
||||
print("Error info: {}".format(e.err_info))
|
||||
```
|
||||
|
||||
|
||||
@@ -194,4 +194,3 @@ from .amdsmi_exception import AmdSmiKeyException
|
||||
from .amdsmi_exception import AmdSmiBdfFormatException
|
||||
from .amdsmi_exception import AmdSmiTimeoutException
|
||||
from .amdsmi_exception import AmdSmiException
|
||||
from .amdsmi_exception import AmdSmiRetCode
|
||||
|
||||
@@ -22,40 +22,9 @@
|
||||
from enum import IntEnum
|
||||
from . import amdsmi_wrapper
|
||||
|
||||
class AmdSmiRetCode(IntEnum):
|
||||
SUCCESS = amdsmi_wrapper.AMDSMI_STATUS_SUCCESS
|
||||
STATUS_INVAL = amdsmi_wrapper.AMDSMI_STATUS_INVAL
|
||||
STATUS_NOT_SUPPORTED = amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED
|
||||
STATUS_FILE_ERROR = amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR
|
||||
STATUS_NO_PERM = amdsmi_wrapper.AMDSMI_STATUS_NO_PERM
|
||||
STATUS_OUT_OF_RESOURCES = amdsmi_wrapper.AMDSMI_STATUS_OUT_OF_RESOURCES
|
||||
STATUS_INTERNAL_EXCEPTION = amdsmi_wrapper.AMDSMI_STATUS_INTERNAL_EXCEPTION
|
||||
STATUS_INPUT_OUT_OF_BOUNDS = amdsmi_wrapper.AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS
|
||||
STATUS_INIT_ERROR = amdsmi_wrapper.AMDSMI_STATUS_INIT_ERROR
|
||||
STATUS_NOT_YET_IMPLEMENTED = amdsmi_wrapper.AMDSMI_STATUS_NOT_YET_IMPLEMENTED
|
||||
STATUS_NOT_FOUND = amdsmi_wrapper.AMDSMI_STATUS_NOT_FOUND
|
||||
STATUS_INSUFFICIENT_SIZE = amdsmi_wrapper.AMDSMI_STATUS_INSUFFICIENT_SIZE
|
||||
STATUS_INTERRUPT = amdsmi_wrapper.AMDSMI_STATUS_INTERRUPT
|
||||
STATUS_UNEXPECTED_SIZE = amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_SIZE
|
||||
STATUS_NO_DATA = amdsmi_wrapper.AMDSMI_STATUS_NO_DATA
|
||||
STATUS_UNEXPECTED_DATA = amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_DATA
|
||||
STATUS_BUSY = amdsmi_wrapper.AMDSMI_STATUS_BUSY
|
||||
STATUS_REFCOUNT_OVERFLOW = amdsmi_wrapper.AMDSMI_STATUS_REFCOUNT_OVERFLOW
|
||||
STATUS_FAIL_LOAD_MODULE = amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_MODULE
|
||||
STATUS_FAIL_LOAD_SYMBOL = amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_SYMBOL
|
||||
STATUS_DRM_ERROR = amdsmi_wrapper.AMDSMI_STATUS_DRM_ERROR
|
||||
STATUS_IO = amdsmi_wrapper.AMDSMI_STATUS_IO
|
||||
STATUS_API_FAILED = amdsmi_wrapper.AMDSMI_STATUS_API_FAILED
|
||||
STATUS_TIMEOUT = amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT
|
||||
STATUS_NO_SLOT = amdsmi_wrapper.AMDSMI_STATUS_NO_SLOT
|
||||
STATUS_RETRY = amdsmi_wrapper.AMDSMI_STATUS_RETRY
|
||||
STATUS_NOT_INIT = amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT
|
||||
UNKNOWN_ERROR = amdsmi_wrapper.AMDSMI_STATUS_UNKNOWN_ERROR
|
||||
|
||||
|
||||
class AmdSmiException(Exception):
|
||||
"""Base smi exception class"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@@ -67,7 +36,7 @@ class AmdSmiLibraryException(AmdSmiException):
|
||||
self.set_err_info()
|
||||
|
||||
def __str__(self):
|
||||
return "An error occured with code: {err_code}({err_info})".format(
|
||||
return "Error code:\n\t{err_code} | {err_info}".format(
|
||||
err_code=self.err_code, err_info=self.err_info
|
||||
)
|
||||
|
||||
@@ -77,34 +46,49 @@ class AmdSmiLibraryException(AmdSmiException):
|
||||
def get_error_code(self):
|
||||
return self.err_code
|
||||
|
||||
# Translate error codes to error strings
|
||||
def set_err_info(self):
|
||||
switch = {
|
||||
AmdSmiRetCode.STATUS_INVAL: "AMDSMI_STATUS_INVAL - Invalid parameters",
|
||||
AmdSmiRetCode.STATUS_NOT_SUPPORTED: "AMDSMI_STATUS_NOT_SUPPORTED - Feature not supported",
|
||||
AmdSmiRetCode.STATUS_FILE_ERROR: "AMDSMI_STATUS_FILE_ERROR - Error opening file",
|
||||
AmdSmiRetCode.STATUS_OUT_OF_RESOURCES: "AMDSMI_STATUS_OUT_OF_RESOURCES - Not enough memory",
|
||||
AmdSmiRetCode.STATUS_INTERNAL_EXCEPTION: "AMDSMI_STATUS_INTERNAL_EXCEPTION - Internal error",
|
||||
AmdSmiRetCode.STATUS_NO_PERM: "AMDSMI_STATUS_NO_PERM - Permission Denied",
|
||||
AmdSmiRetCode.STATUS_INPUT_OUT_OF_BOUNDS: "AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS - Out of bounds",
|
||||
AmdSmiRetCode.STATUS_INIT_ERROR: "AMDSMI_STATUS_INIT_ERROR - Initialization error",
|
||||
AmdSmiRetCode.STATUS_BUSY: "AMDSMI_STATUS_BUSY - Device busy",
|
||||
AmdSmiRetCode.STATUS_NOT_FOUND: "AMDSMI_STATUS_NOT_FOUND - Device Not found",
|
||||
AmdSmiRetCode.STATUS_IO: "AMDSMI_STATUS_IO - I/O Error",
|
||||
AmdSmiRetCode.STATUS_NOT_YET_IMPLEMENTED: "AMDSMI_STATUS_NOT_YET_IMPLEMENTED - Feature not yet implemented",
|
||||
AmdSmiRetCode.STATUS_INSUFFICIENT_SIZE: "AMDSMI_STATUS_INSUFFICIENT_SIZE - Insufficient size for operation",
|
||||
AmdSmiRetCode.STATUS_INTERRUPT: "AMDSMI_STATUS_INTERRUPT - Interrupt ocurred during execution",
|
||||
AmdSmiRetCode.STATUS_UNEXPECTED_SIZE: "AMDSMI_STATUS_UNEXPECTED_SIZE - unexpected size of data was read",
|
||||
AmdSmiRetCode.STATUS_NO_DATA: "AMDSMI_STATUS_NO_DATA - No data was found for given input",
|
||||
AmdSmiRetCode.STATUS_UNEXPECTED_DATA: "AMDSMI_STATUS_UNEXPECTED_DATA - The data read or provided was unexpected",
|
||||
AmdSmiRetCode.STATUS_REFCOUNT_OVERFLOW: "AMDSMI_STATUS_REFCOUNT_OVERFLOW - Internal reference counter exceeded INT32_MAX",
|
||||
AmdSmiRetCode.STATUS_FAIL_LOAD_MODULE: "AMDSMI_STATUS_FAIL_LOAD_MODULE - Fail to load lib",
|
||||
AmdSmiRetCode.STATUS_FAIL_LOAD_SYMBOL: "AMDSMI_STATUS_FAIL_LOAD_SYMBOL - Fail to load symbol",
|
||||
AmdSmiRetCode.STATUS_DRM_ERROR: "AMDSMI_STATUS_DRM_ERROR - Error when called libdrm",
|
||||
AmdSmiRetCode.STATUS_API_FAILED: "AMDSMI_STATUS_API_FAILED - API call failed",
|
||||
AmdSmiRetCode.STATUS_TIMEOUT: "AMDSMI_STATUS_TIMEOUT - Timeout in API call",
|
||||
AmdSmiRetCode.STATUS_NO_SLOT: "AMDSMI_STATUS_NO_SLOT - No more free slot",
|
||||
AmdSmiRetCode.STATUS_RETRY: "AMDSMI_STATUS_RETRY - Retry operation",
|
||||
AmdSmiRetCode.STATUS_NOT_INIT: "AMDSMI_STATUS_NOT_INIT - Device not initialized",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_INVAL : "AMDSMI_STATUS_INVAL - Invalid parameters",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED : "AMDSMI_STATUS_NOT_SUPPORTED - Feature not supported",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NOT_YET_IMPLEMENTED : "AMDSMI_STATUS_NOT_YET_IMPLEMENTED - Feature not yet implemented",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_MODULE : "AMDSMI_STATUS_FAIL_LOAD_MODULE - Fail to load lib",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_SYMBOL : "AMDSMI_STATUS_FAIL_LOAD_SYMBOL - Fail to load symbol",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_DRM_ERROR : "AMDSMI_STATUS_DRM_ERROR - Error when called libdrm",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_API_FAILED : "AMDSMI_STATUS_API_FAILED - API call failed",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT : "AMDSMI_STATUS_TIMEOUT - Timeout in API call",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_RETRY : "AMDSMI_STATUS_RETRY - Retry operation",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NO_PERM : "AMDSMI_STATUS_NO_PERM - Permission Denied",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_INTERRUPT : "AMDSMI_STATUS_INTERRUPT - Interrupt ocurred during execution",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_IO : "AMDSMI_STATUS_IO - I/O Error",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_ADDRESS_FAULT : "AMDSMI_STATUS_ADDRESS_FAULT - Bad address",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR : "AMDSMI_STATUS_FILE_ERROR - Error opening file",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_OUT_OF_RESOURCES : "AMDSMI_STATUS_OUT_OF_RESOURCES - Not enough memory",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_INTERNAL_EXCEPTION : "AMDSMI_STATUS_INTERNAL_EXCEPTION - Internal error",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS : "AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS - Out of bounds",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_INIT_ERROR : "AMDSMI_STATUS_INIT_ERROR - Initialization error",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_REFCOUNT_OVERFLOW : "AMDSMI_STATUS_REFCOUNT_OVERFLOW - Internal reference counter exceeded INT32_MAX",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_BUSY : "AMDSMI_STATUS_BUSY - Device busy",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NOT_FOUND : "AMDSMI_STATUS_NOT_FOUND - Device Not found",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT : "AMDSMI_STATUS_NOT_INIT - Device not initialized",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NO_SLOT : "AMDSMI_STATUS_NO_SLOT - No more free slot",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED : "AMDSMI_STATUS_DRIVER_NOT_LOADED - Processor driver not loaded",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NO_DATA : "AMDSMI_STATUS_NO_DATA - No data was found for given input",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_INSUFFICIENT_SIZE : "AMDSMI_STATUS_INSUFFICIENT_SIZE - Insufficient size for operation",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_SIZE : "AMDSMI_STATUS_UNEXPECTED_SIZE - unexpected size of data was read",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_DATA : "AMDSMI_STATUS_UNEXPECTED_DATA - The data read or provided was unexpected",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_NON_AMD_CPU : "AMDSMI_STATUS_NON_AMD_CPU - System has non-AMD CPU",
|
||||
amdsmi_wrapper.AMDSMI_NO_ENERGY_DRV : "AMD_SMI_NO_ENERGY_DRV - Energy driver not found",
|
||||
amdsmi_wrapper.AMDSMI_NO_MSR_DRV : "AMDSMI_NO_MSR_DRV - MSR driver not found",
|
||||
amdsmi_wrapper.AMDSMI_NO_HSMP_DRV : "AMD_SMI_NO_HSMP_DRV - HSMP driver not found",
|
||||
amdsmi_wrapper.AMDSMI_NO_HSMP_SUP : "AMD_SMI_NO_HSMP_SUP - HSMP not supported",
|
||||
amdsmi_wrapper.AMDSMI_NO_HSMP_MSG_SUP : "AMD_SMI_NO_HSMP_MSG_SUP - HSMP message/feature not supported",
|
||||
amdsmi_wrapper.AMDSMI_HSMP_TIMEOUT : "AMD_SMI_HSMP_TIMEOUT - HSMP message timeout",
|
||||
amdsmi_wrapper.AMDSMI_NO_DRV : "AMDSMI_NO_DRV - No Energy and HSMP driver present",
|
||||
amdsmi_wrapper.AMDSMI_FILE_NOT_FOUND : "AMDSMI_FILE_NOT_FOUND - File or directory not found",
|
||||
amdsmi_wrapper.AMDSMI_ARG_PTR_NULL : "AMDSMI_ARG_PTR_NULL - Parsed argument is invalid",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_MAP_ERROR : "AMDSMI_STATUS_MAP_ERROR - The internal library error did not map to a status code",
|
||||
amdsmi_wrapper.AMDSMI_STATUS_UNKNOWN_ERROR : "AMDSMI_STATUS_UNKNOWN_ERROR - An unknown error occurred"
|
||||
}
|
||||
|
||||
self.err_info = switch.get(self.err_code, "AMDSMI_STATUS_UNKNOWN_ERROR - An unknown error occurred")
|
||||
@@ -112,12 +96,12 @@ class AmdSmiLibraryException(AmdSmiException):
|
||||
|
||||
class AmdSmiRetryException(AmdSmiLibraryException):
|
||||
def __init__(self):
|
||||
super().__init__(AmdSmiRetCode.RETRY)
|
||||
super().__init__(amdsmi_wrapper.AMDSMI_STATUS_RETRY)
|
||||
|
||||
|
||||
class AmdSmiTimeoutException(AmdSmiLibraryException):
|
||||
def __init__(self):
|
||||
super().__init__(AmdSmiRetCode.TIMEOUT)
|
||||
super().__init__(amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)
|
||||
|
||||
|
||||
class AmdSmiParameterException(AmdSmiException):
|
||||
|
||||
Reference in New Issue
Block a user