diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 0805064c99..f687cd63b8 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -102,7 +102,7 @@ Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). - --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default). ``` ``` bash @@ -123,7 +123,7 @@ Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). - --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default). ``` ```bash @@ -153,7 +153,7 @@ Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). - --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default). ``` ```bash @@ -176,7 +176,7 @@ Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). 
- --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default). ``` ```bash @@ -215,7 +215,7 @@ Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). - --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default). ``` ```bash @@ -245,7 +245,7 @@ Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). - --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default). ``` ```bash @@ -271,7 +271,7 @@ Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). - --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default). ``` ```bash @@ -296,7 +296,7 @@ Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). 
- --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default). ``` ```bash @@ -323,7 +323,7 @@ Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). - --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands (ERROR by default). ``` ## Disclaimer diff --git a/amdsmi_cli/amdsmi_cli.py b/amdsmi_cli/amdsmi_cli.py index a696a774a8..d1232b2b26 100755 --- a/amdsmi_cli/amdsmi_cli.py +++ b/amdsmi_cli/amdsmi_cli.py @@ -66,14 +66,22 @@ if __name__ == "__main__": amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.csv.value if args.file: amd_smi_commands.logger.destination = args.file - if args.loglevel: - logging_dict = {'DEBUG' : logging.DEBUG, - 'INFO' : logging.INFO, - 'WARNING': logging.WARNING, - 'ERROR': logging.ERROR, - 'CRITICAL': logging.CRITICAL} - # Enable debug logs on amdsmi library ie. 
RSMI_LOGGING = 1 in environment or otherwise - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel]) + + # Remove previous log handlers + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + + logging_dict = {'DEBUG' : logging.DEBUG, + 'INFO' : logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL} + # To enable debug logs on rocm-smi library set RSMI_LOGGING = 1 in environment + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel]) + + # Disable traceback for non-debug log levels + if args.loglevel != "DEBUG": + sys.tracebacklimit = -1 # Execute subcommands args.func(args) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 179296ff40..f5020d708f 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -191,6 +191,9 @@ class AMDSMICommands(): static_dict = {} + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + if args.asic: try: asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu) @@ -203,7 +206,7 @@ class AMDSMICommands(): static_dict['asic'] = asic_info except amdsmi_exception.AmdSmiLibraryException as e: static_dict['asic'] = "N/A" - logging.debug("Failed to get asic info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) if args.bus: bus_output_info = {} @@ -239,13 +242,13 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: bus_info = "N/A" - logging.debug("Failed to get bus info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info()) try: bus_output_info['bdf'] = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: bus_output_info['bdf'] = "N/A" - logging.debug("Failed 
to get bdf for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info()) bus_output_info.update(bus_info) static_dict['bus'] = bus_output_info @@ -255,7 +258,7 @@ class AMDSMICommands(): static_dict['vbios'] = vbios_info except amdsmi_exception.AmdSmiLibraryException as e: static_dict['vbios'] = "N/A" - logging.debug("Failed to get vbios info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get vbios info for gpu %s | %s", gpu_id, e.get_error_info()) if self.helpers.is_linux() and self.helpers.is_baremetal(): if args.board: @@ -271,7 +274,7 @@ class AMDSMICommands(): static_dict['board'] = board_info except amdsmi_exception.AmdSmiLibraryException as e: static_dict['board'] = "N/A" - logging.debug("Failed to get board info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get board info for gpu %s | %s", gpu_id, e.get_error_info()) if args.limit: # Power limits try: @@ -283,7 +286,7 @@ class AMDSMICommands(): power_limit_error = True max_power_limit = "N/A" current_power_limit = "N/A" - logging.debug("Failed to get power cap info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info()) # Edge temperature limits try: @@ -293,7 +296,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: slowdown_temp_edge_limit_error = True slowdown_temp_edge_limit = "N/A" - logging.debug("Failed to get edge temperature slowdown metric for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get edge temperature slowdown metric for gpu %s | %s", gpu_id, e.get_error_info()) if slowdown_temp_edge_limit == 0: slowdown_temp_edge_limit_error = True @@ -306,7 +309,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: shutdown_temp_edge_limit_error = True shutdown_temp_edge_limit = "N/A" - logging.debug("Failed to get edge 
temperature shutdown metrics for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get edge temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) if shutdown_temp_edge_limit == 0: shutdown_temp_edge_limit_error = True @@ -320,7 +323,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: slowdown_temp_hotspot_limit_error = True slowdown_temp_hotspot_limit = "N/A" - logging.debug("Failed to get hotspot temperature slowdown metrics for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get hotspot temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) try: shutdown_temp_hotspot_limit_error = False @@ -329,7 +332,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: shutdown_temp_hotspot_limit_error = True shutdown_temp_hotspot_limit = "N/A" - logging.debug("Failed to get hotspot temperature shutdown metrics for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get hotspot temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) # VRAM temperature limits @@ -340,7 +343,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: slowdown_temp_vram_limit_error = True slowdown_temp_vram_limit = "N/A" - logging.debug("Failed to get vram temperature slowdown metrics for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get vram temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) try: shutdown_temp_vram_limit_error = False @@ -349,7 +352,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: shutdown_temp_vram_limit_error = True shutdown_temp_vram_limit = "N/A" - logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) if 
self.logger.is_human_readable_format(): unit = 'W' @@ -392,7 +395,7 @@ class AMDSMICommands(): static_dict['driver'] = driver_info except amdsmi_exception.AmdSmiLibraryException as e: static_dict['driver'] = "N/A" - logging.debug("Failed to get driver info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get driver info for gpu %s | %s", gpu_id, e.get_error_info()) if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): if args.ras: @@ -400,7 +403,7 @@ class AMDSMICommands(): static_dict['ras'] = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: static_dict['ras'] = "N/A" - logging.debug("Failed to get ras block features for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info()) if args.vram: try: vram_info = amdsmi_interface.amdsmi_get_gpu_vram_info(args.gpu) @@ -417,7 +420,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: vram_info = "N/A" - logging.debug("Failed to get vram info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get vram info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['vram'] = vram_info @@ -427,13 +430,13 @@ class AMDSMICommands(): numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: numa_node_number = "N/A" - logging.debug("Failed to get numa node number for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get numa node number for gpu %s | %s", gpu_id, e.get_error_info()) try: numa_affinity = amdsmi_interface.amdsmi_get_gpu_topo_numa_affinity(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: numa_affinity = "N/A" - logging.debug("Failed to get numa affinity for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, 
e.get_error_info()) static_dict['numa'] = {'node' : numa_node_number, 'affinity' : numa_affinity} @@ -500,6 +503,10 @@ class AMDSMICommands(): args.gpu = device_handle fw_list = {} + + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + if args.fw_list: try: fw_info = amdsmi_interface.amdsmi_get_fw_info(args.gpu) @@ -519,7 +526,7 @@ class AMDSMICommands(): fw_list.update(fw_info) except amdsmi_exception.AmdSmiLibraryException as e: fw_list['fw_list'] = "N/A" - logging.debug("Failed to get firmware info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get firmware info for gpu %s | %s", gpu_id, e.get_error_info()) multiple_devices_csv_override = False # Convert and store output by pid for csv format @@ -586,13 +593,16 @@ class AMDSMICommands(): values_dict = {} bad_page_err_output = '' + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + try: bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu) bad_page_error = False except amdsmi_exception.AmdSmiLibraryException as e: bad_page_error = True bad_page_err_output = "N/A" - logging.debug("Failed to get bad page info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info()) if bad_page_info == "No bad pages found.": bad_page_error = True @@ -798,6 +808,10 @@ class AMDSMICommands(): # Add timestamp and store values for specified arguments values_dict = {} + + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + if self.helpers.is_linux() and self.helpers.is_baremetal(): if args.usage: try: @@ -818,7 +832,7 @@ class AMDSMICommands(): values_dict['usage'] = engine_usage except amdsmi_exception.AmdSmiLibraryException as e: values_dict['usage'] = "N/A" - logging.debug("Failed to get gpu activity for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get gpu activity 
for gpu %s | %s", gpu_id, e.get_error_info()) if args.power: power_dict = {'current_power': "N/A", 'current_gfx_voltage': "N/A", @@ -845,7 +859,7 @@ class AMDSMICommands(): power_dict['power_limit'] = power_info['power_limit'] except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get power info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get power info for gpu %s | %s", gpu_id, e.get_error_info()) try: is_power_management_enabled = amdsmi_interface.amdsmi_is_gpu_power_management_enabled(args.gpu) @@ -854,7 +868,7 @@ class AMDSMICommands(): else: power_dict['power_management'] = "DISABLED" except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get power management status for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['power'] = power_dict if args.clock: @@ -870,20 +884,20 @@ class AMDSMICommands(): clocks['gfx'] = gfx_clock except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get gfx clock info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e.get_error_info()) try: # is_clk_locked = amdsmi_interface.amdsmi_is_clk_locked(args.gpu, amdsmi_interface.AmdSmiClkType.GFX) is_clk_locked = "N/A" except amdsmi_exception.AmdSmiLibraryException as e: is_clk_locked = "N/A" - logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", args.gpu, e.get_error_info()) - + logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", gpu_id, e.get_error_info()) + if isinstance(clocks['gfx'], dict): clocks['gfx']['is_clk_locked'] = is_clk_locked else: clocks['gfx'] = {'is_clk_locked': is_clk_locked} - + try: mem_clock = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_interface.AmdSmiClkType.MEM) @@ -894,7 +908,7 @@ class AMDSMICommands(): clocks['mem'] = 
mem_clock except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get mem clock info for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['clock'] = clocks if args.temperature: @@ -903,14 +917,14 @@ class AMDSMICommands(): args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) except amdsmi_exception.AmdSmiLibraryException as e: temperature_edge_current = "N/A" - logging.debug("Failed to get current edge temperature for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get current edge temperature for gpu %s | %s", gpu_id, e.get_error_info()) try: temperature_edge_limit = amdsmi_interface.amdsmi_get_temp_metric( args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) except amdsmi_exception.AmdSmiLibraryException as e: temperature_edge_limit = "N/A" - logging.debug("Failed to get edge temperature limit for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get edge temperature limit for gpu %s | %s", gpu_id, e.get_error_info()) # If edge limit is reporting 0 then set the current edge temp to N/A if temperature_edge_limit == 0: @@ -921,14 +935,14 @@ class AMDSMICommands(): args.gpu, amdsmi_interface.AmdSmiTemperatureType.HOTSPOT, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) except amdsmi_exception.AmdSmiLibraryException as e: temperature_hotspot_current = "N/A" - logging.debug("Failed to get current hotspot temperature for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get current hotspot temperature for gpu %s | %s", gpu_id, e.get_error_info()) try: temperature_vram_current = amdsmi_interface.amdsmi_get_temp_metric( args.gpu, amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) except amdsmi_exception.AmdSmiLibraryException as e: 
temperature_vram_current = "N/A" - logging.debug("Failed to get current vram temperature for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get current vram temperature for gpu %s | %s", gpu_id, e.get_error_info()) temperatures = {'edge': temperature_edge_current, 'hotspot': temperature_hotspot_current, @@ -950,7 +964,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: ecc_count['correctable'] = "N/A" ecc_count['uncorrectable'] = "N/A" - logging.debug("Failed to get ecc count for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get ecc count for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['ecc'] = ecc_count if args.ecc_block: @@ -966,14 +980,14 @@ class AMDSMICommands(): 'uncorrectable': ecc_count['uncorrectable_count']} except amdsmi_exception.AmdSmiLibraryException as e: ecc_count = "N/A" - logging.debug("Failed to get ecc count for gpu %s at block %s | %s", args.gpu, gpu_block, e.get_error_info()) + logging.debug("Failed to get ecc count for gpu %s at block %s | %s", gpu_id, gpu_block, e.get_error_info()) ecc_dict[state['block']] = {'correctable' : ecc_count, 'uncorrectable': ecc_count} values_dict['ecc_block'] = ecc_dict except amdsmi_exception.AmdSmiLibraryException as e: values_dict['ecc_block'] = "N/A" - logging.debug("Failed to get ecc block features for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get ecc block features for gpu %s | %s", gpu_id, e.get_error_info()) if args.pcie: pcie_dict = {'current_width': "N/A", 'current_speed': "N/A", @@ -997,13 +1011,13 @@ class AMDSMICommands(): unit = 'GT/s' pcie_link_status['current_speed'] = f"{pcie_link_status['pcie_speed']} {unit}" except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pcie link status for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) try: pci_replay_counter = 
amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu) pcie_dict['replay_count'] = pci_replay_counter except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pci replay counter for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) try: pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) @@ -1024,7 +1038,7 @@ class AMDSMICommands(): pcie_dict['current_bandwith_received'] = received pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz'] except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pcie bandwidth for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['pcie'] = pcie_dict if args.fan: @@ -1077,7 +1091,7 @@ class AMDSMICommands(): values_dict['voltage_curve'] = voltage_point_dict except amdsmi_exception.AmdSmiLibraryException as e: values_dict['voltage_curve'] = "N/A" - logging.debug("Failed to get voltage curve for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get voltage curve for gpu %s | %s", gpu_id, e.get_error_info()) if args.overdrive: try: overdrive_level = amdsmi_interface.amdsmi_get_gpu_overdrive_level(args.gpu) @@ -1089,14 +1103,14 @@ class AMDSMICommands(): values_dict['overdrive'] = overdrive_level except amdsmi_exception.AmdSmiLibraryException as e: values_dict['overdrive'] = "N/A" - logging.debug("Failed to get overdrive level for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get overdrive level for gpu %s | %s", gpu_id, e.get_error_info()) if args.perf_level: try: perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu) values_dict['perf_level'] = perf_level except amdsmi_exception.AmdSmiLibraryException as e: values_dict['perf_level'] = "N/A" - logging.debug("Failed to get perf level for gpu %s | %s", args.gpu, 
e.get_error_info()) + logging.debug("Failed to get perf level for gpu %s | %s", gpu_id, e.get_error_info()) if self.helpers.is_linux() and self.helpers.is_baremetal(): if args.xgmi_err: @@ -1104,7 +1118,7 @@ class AMDSMICommands(): values_dict['xgmi_err'] = amdsmi_interface.amdsmi_gpu_xgmi_error_status(args.gpu) except amdsmi_interface.AmdSmiLibraryException as e: values_dict['xgmi_err'] = "N/A" - logging.debug("Failed to get xgmi error status for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info()) if args.energy: try: energy_dict = amdsmi_interface.amdsmi_get_energy_count(args.gpu) @@ -1140,19 +1154,19 @@ class AMDSMICommands(): total_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) memory_usage['total_vram'] = total_vram // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get total VRAM memory for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get total VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) try: total_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) memory_usage['total_visible_vram'] = total_visible_vram // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get total VIS VRAM memory for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get total VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) try: total_gtt = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) memory_usage['total_gtt'] = total_gtt // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get total GTT memory for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get total GTT memory for gpu %s | %s", gpu_id, e.get_error_info()) # Used 
VRAM try: @@ -1160,19 +1174,19 @@ class AMDSMICommands(): memory_usage['used_vram'] = used_vram // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get used VRAM memory for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get used VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) try: used_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) memory_usage['used_visible_vram'] = used_visible_vram // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get used VIS VRAM memory for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get used VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) try: used_gtt = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) memory_usage['used_gtt'] = used_gtt // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get used GTT memory for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get used GTT memory for gpu %s | %s", gpu_id, e.get_error_info()) # Free VRAM if memory_usage['total_vram'] != "N/A" and memory_usage['used_vram'] != "N/A": @@ -1288,11 +1302,14 @@ class AMDSMICommands(): else: raise IndexError("args.gpu should not be an empty list") + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + # Populate initial processes try: process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get process list for gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info()) raise e filtered_process_values = [] @@ -1301,7 +1318,7 @@ class AMDSMICommands(): process_info = amdsmi_interface.amdsmi_get_gpu_process_info(args.gpu, 
process_handle) except amdsmi_exception.AmdSmiLibraryException as e: process_info = "N/A" - logging.debug("Failed to get process info for gpu %s on process_handle %s | %s", args.gpu, process_handle, e.get_error_info()) + logging.debug("Failed to get process info for gpu %s on process_handle %s | %s", gpu_id, process_handle, e.get_error_info()) filtered_process_values.append({'process_info': process_info}) continue @@ -1467,7 +1484,10 @@ class AMDSMICommands(): src_gpu_links[dest_gpu_key] = bool(dest_gpu_link_status) except amdsmi_exception.AmdSmiLibraryException as e: src_gpu_links[dest_gpu_key] = "N/A" - logging.debug("Failed to get link status for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info()) + logging.debug("Failed to get link status for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) topo_values[src_gpu_index]['link_accessibility'] = src_gpu_links @@ -1487,7 +1507,10 @@ class AMDSMICommands(): src_gpu_weight[dest_gpu_key] = dest_gpu_link_weight except amdsmi_exception.AmdSmiLibraryException as e: src_gpu_weight[dest_gpu_key] = "N/A" - logging.debug("Failed to get link weight for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info()) + logging.debug("Failed to get link weight for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) topo_values[src_gpu_index]['weight'] = src_gpu_weight @@ -1507,7 +1530,10 @@ class AMDSMICommands(): src_gpu_hops[dest_gpu_key] = dest_gpu_hops except amdsmi_exception.AmdSmiLibraryException as e: src_gpu_hops[dest_gpu_key] = "N/A" - logging.debug("Failed to get link hops for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info()) + logging.debug("Failed to get link hops for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) 
topo_values[src_gpu_index]['hops'] = src_gpu_hops @@ -1532,7 +1558,10 @@ class AMDSMICommands(): src_gpu_link_type[dest_gpu_key] = "XGMI" except amdsmi_exception.AmdSmiLibraryException as e: src_gpu_link_type[dest_gpu_key] = "N/A" - logging.debug("Failed to get link type for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info()) + logging.debug("Failed to get link type for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) topo_values[src_gpu_index]['link_type'] = src_gpu_link_type @@ -1556,7 +1585,10 @@ class AMDSMICommands(): continue except amdsmi_exception.AmdSmiLibraryException as e: src_gpu_link_type[dest_gpu_key] = "N/A" - logging.debug("Failed to get link type for %s to %s | %s", src_gpu, dest_gpu, e.get_error_info()) + logging.debug("Failed to get link type for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) try: min_bw = amdsmi_interface.amdsmi_get_minmax_bandwidth_between_processors(src_gpu, dest_gpu)['min_bandwidth'] @@ -1565,6 +1597,10 @@ class AMDSMICommands(): src_gpu_link_type[dest_gpu_key] = f'{min_bw}-{max_bw}' except amdsmi_exception.AmdSmiLibraryException as e: src_gpu_link_type[dest_gpu_key] = e.get_error_info() + logging.debug("Failed to get min max bandwidth for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) topo_values[src_gpu_index]['numa_bandwidth'] = src_gpu_link_type @@ -1638,7 +1674,7 @@ class AMDSMICommands(): try: amdsmi_interface.amdsmi_set_gpu_fan_speed(args.gpu, 0, args.fan) except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from 
e raise ValueError(f"Unable to set fan speed {args.fan} on {gpu_string}") from e @@ -1648,7 +1684,7 @@ class AMDSMICommands(): try: amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, perf_level) except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set performance level {args.perflevel} on {gpu_string}") from e @@ -1659,7 +1695,7 @@ class AMDSMICommands(): try: amdsmi_interface.amdsmi_set_gpu_perf_determinism_mode(args.gpu, args.perfdeterminism) except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set performance determinism and clock frequency to {args.perfdeterminism} on {gpu_string}") from e @@ -1721,13 +1757,16 @@ class AMDSMICommands(): args.gpu = device_handle + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + if args.gpureset: if self.helpers.is_amd_device(args.gpu): try: amdsmi_interface.amdsmi_reset_gpu(args.gpu) result = 'Successfully reset GPU' except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e result = "Failed to reset GPU" else: @@ -1742,30 +1781,30 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, 0) reset_clocks_results['overdrive'] = 'Overdrive set to 0' except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + 
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_clocks_results['overdrive'] = "N/A" - logging.debug("Failed to reset overdrive on gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to reset overdrive on gpu %s | %s", gpu_id, e.get_error_info()) try: level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) reset_clocks_results['clocks'] = 'Successfully reset clocks' except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_clocks_results['clocks'] = "N/A" - logging.debug("Failed to reset perf level on gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) try: level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) reset_clocks_results['performance'] = 'Performance level reset to auto' except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_clocks_results['performance'] = "N/A" - logging.debug("Failed to reset perf level on gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_clocks', reset_clocks_results) if args.fans: @@ -1773,10 +1812,10 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_reset_gpu_fan(args.gpu, 0) result = 'Successfully reset fan speed to driver control' except 
amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e result = "N/A" - logging.debug("Failed to reset fans on gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to reset fans on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_fans', result) if args.profile: @@ -1787,20 +1826,20 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_set_gpu_power_profile(args.gpu, 0, power_profile_mask) reset_profile_results['power_profile'] = 'Successfully reset Power Profile' except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_profile_results['power_profile'] = "N/A" - logging.debug("Failed to reset power profile on gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to reset power profile on gpu %s | %s", gpu_id, e.get_error_info()) try: level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) reset_profile_results['performance_level'] = 'Successfully reset Performance Level' except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_profile_results['performance_level'] = "N/A" - logging.debug("Failed to reset perf level on gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_profile', 
reset_profile_results) if args.xgmierr: @@ -1808,10 +1847,10 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_reset_gpu_xgmi_error(args.gpu) result = 'Successfully reset XGMI Error count' except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e result = "N/A" - logging.debug("Failed to reset xgmi error count on gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to reset xgmi error count on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_xgmi_err', result) if args.perfdeterminism: try: @@ -1819,10 +1858,10 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) result = 'Successfully disabled performance determinism' except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e result = "N/A" - logging.debug("Failed to set perf level on gpu %s | %s", args.gpu, e.get_error_info()) + logging.debug("Failed to set perf level on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_perf_determinism', result) @@ -1857,7 +1896,7 @@ class AMDSMICommands(): commands.logger.store_output(device, 'values', values_dict) commands.logger.print_output() except amdsmi_exception.AmdSmiLibraryException as e: - if e.err_code != amdsmi_exception.AmdSmiRetCode.STATUS_NO_DATA: + if e.err_code != amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_DATA: print(e) except Exception as e: print(e) diff --git a/amdsmi_cli/amdsmi_init.py b/amdsmi_cli/amdsmi_init.py index 5f201d7d15..7e837026cc 100644 --- a/amdsmi_cli/amdsmi_init.py +++ b/amdsmi_cli/amdsmi_init.py @@ 
-37,6 +37,8 @@ from amdsmi import amdsmi_exception # Using basic python logging for user errors and development logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.ERROR) # User level logging +# This traceback limit only affects this file; once the code hits the cli portion it gets reset to the user's preference +sys.tracebacklimit = -1 # Disable traceback for user errors # On initial import set initialized variable AMDSMI_INITIALIZED = False @@ -66,8 +68,7 @@ def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS): amdsmi_interface.amdsmi_init(flag) except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as err: raise err - - logging.info('AMDSMI initialized successfully') # without errors really + logging.debug('AMDSMI initialized successfully') else: logging.error('Driver not initialized (amdgpu not found in modules)') exit(-1) diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index e58344598a..65cb7bd85c 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -209,7 +209,7 @@ class AMDSMIParser(argparse.ArgumentParser): json_help = "Displays output in JSON format (human readable by default)." csv_help = "Displays output in CSV format (human readable by default)." file_help = "Saves output into a file on the provided path (stdout by default)." - loglevel_help = "Set the logging level for the parser commands" + loglevel_help = "Set the logging level for the parser commands (ERROR by default)." 
command_modifier_group = subcommand_parser.add_argument_group('Command Modifiers') diff --git a/py-interface/README.md b/py-interface/README.md index ca39ab110c..6efba3b16d 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -60,7 +60,7 @@ try: print("No GPUs on machine") except AmdSmiException as e: print("Error code: {}".format(e.err_code)) - if e.err_code == AmdSmiRetCode.STATUS_RETRY: + if e.err_code == amdsmi_wrapper.AMDSMI_STATUS_RETRY: print("Error info: {}".format(e.err_info)) ``` diff --git a/py-interface/__init__.py b/py-interface/__init__.py index 050a25a2c1..6be5f36436 100644 --- a/py-interface/__init__.py +++ b/py-interface/__init__.py @@ -194,4 +194,3 @@ from .amdsmi_exception import AmdSmiKeyException from .amdsmi_exception import AmdSmiBdfFormatException from .amdsmi_exception import AmdSmiTimeoutException from .amdsmi_exception import AmdSmiException -from .amdsmi_exception import AmdSmiRetCode diff --git a/py-interface/amdsmi_exception.py b/py-interface/amdsmi_exception.py index 046659e459..162e4f7a00 100644 --- a/py-interface/amdsmi_exception.py +++ b/py-interface/amdsmi_exception.py @@ -22,40 +22,9 @@ from enum import IntEnum from . 
import amdsmi_wrapper -class AmdSmiRetCode(IntEnum): - SUCCESS = amdsmi_wrapper.AMDSMI_STATUS_SUCCESS - STATUS_INVAL = amdsmi_wrapper.AMDSMI_STATUS_INVAL - STATUS_NOT_SUPPORTED = amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED - STATUS_FILE_ERROR = amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR - STATUS_NO_PERM = amdsmi_wrapper.AMDSMI_STATUS_NO_PERM - STATUS_OUT_OF_RESOURCES = amdsmi_wrapper.AMDSMI_STATUS_OUT_OF_RESOURCES - STATUS_INTERNAL_EXCEPTION = amdsmi_wrapper.AMDSMI_STATUS_INTERNAL_EXCEPTION - STATUS_INPUT_OUT_OF_BOUNDS = amdsmi_wrapper.AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS - STATUS_INIT_ERROR = amdsmi_wrapper.AMDSMI_STATUS_INIT_ERROR - STATUS_NOT_YET_IMPLEMENTED = amdsmi_wrapper.AMDSMI_STATUS_NOT_YET_IMPLEMENTED - STATUS_NOT_FOUND = amdsmi_wrapper.AMDSMI_STATUS_NOT_FOUND - STATUS_INSUFFICIENT_SIZE = amdsmi_wrapper.AMDSMI_STATUS_INSUFFICIENT_SIZE - STATUS_INTERRUPT = amdsmi_wrapper.AMDSMI_STATUS_INTERRUPT - STATUS_UNEXPECTED_SIZE = amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_SIZE - STATUS_NO_DATA = amdsmi_wrapper.AMDSMI_STATUS_NO_DATA - STATUS_UNEXPECTED_DATA = amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_DATA - STATUS_BUSY = amdsmi_wrapper.AMDSMI_STATUS_BUSY - STATUS_REFCOUNT_OVERFLOW = amdsmi_wrapper.AMDSMI_STATUS_REFCOUNT_OVERFLOW - STATUS_FAIL_LOAD_MODULE = amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_MODULE - STATUS_FAIL_LOAD_SYMBOL = amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_SYMBOL - STATUS_DRM_ERROR = amdsmi_wrapper.AMDSMI_STATUS_DRM_ERROR - STATUS_IO = amdsmi_wrapper.AMDSMI_STATUS_IO - STATUS_API_FAILED = amdsmi_wrapper.AMDSMI_STATUS_API_FAILED - STATUS_TIMEOUT = amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT - STATUS_NO_SLOT = amdsmi_wrapper.AMDSMI_STATUS_NO_SLOT - STATUS_RETRY = amdsmi_wrapper.AMDSMI_STATUS_RETRY - STATUS_NOT_INIT = amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT - UNKNOWN_ERROR = amdsmi_wrapper.AMDSMI_STATUS_UNKNOWN_ERROR - class AmdSmiException(Exception): """Base smi exception class""" - pass @@ -67,7 +36,7 @@ class AmdSmiLibraryException(AmdSmiException): self.set_err_info() def 
__str__(self): - return "An error occured with code: {err_code}({err_info})".format( + return "Error code:\n\t{err_code} | {err_info}".format( err_code=self.err_code, err_info=self.err_info ) @@ -77,34 +46,49 @@ class AmdSmiLibraryException(AmdSmiException): def get_error_code(self): return self.err_code + # Translate error codes to error strings def set_err_info(self): switch = { - AmdSmiRetCode.STATUS_INVAL: "AMDSMI_STATUS_INVAL - Invalid parameters", - AmdSmiRetCode.STATUS_NOT_SUPPORTED: "AMDSMI_STATUS_NOT_SUPPORTED - Feature not supported", - AmdSmiRetCode.STATUS_FILE_ERROR: "AMDSMI_STATUS_FILE_ERROR - Error opening file", - AmdSmiRetCode.STATUS_OUT_OF_RESOURCES: "AMDSMI_STATUS_OUT_OF_RESOURCES - Not enough memory", - AmdSmiRetCode.STATUS_INTERNAL_EXCEPTION: "AMDSMI_STATUS_INTERNAL_EXCEPTION - Internal error", - AmdSmiRetCode.STATUS_NO_PERM: "AMDSMI_STATUS_NO_PERM - Permission Denied", - AmdSmiRetCode.STATUS_INPUT_OUT_OF_BOUNDS: "AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS - Out of bounds", - AmdSmiRetCode.STATUS_INIT_ERROR: "AMDSMI_STATUS_INIT_ERROR - Initialization error", - AmdSmiRetCode.STATUS_BUSY: "AMDSMI_STATUS_BUSY - Device busy", - AmdSmiRetCode.STATUS_NOT_FOUND: "AMDSMI_STATUS_NOT_FOUND - Device Not found", - AmdSmiRetCode.STATUS_IO: "AMDSMI_STATUS_IO - I/O Error", - AmdSmiRetCode.STATUS_NOT_YET_IMPLEMENTED: "AMDSMI_STATUS_NOT_YET_IMPLEMENTED - Feature not yet implemented", - AmdSmiRetCode.STATUS_INSUFFICIENT_SIZE: "AMDSMI_STATUS_INSUFFICIENT_SIZE - Insufficient size for operation", - AmdSmiRetCode.STATUS_INTERRUPT: "AMDSMI_STATUS_INTERRUPT - Interrupt ocurred during execution", - AmdSmiRetCode.STATUS_UNEXPECTED_SIZE: "AMDSMI_STATUS_UNEXPECTED_SIZE - unexpected size of data was read", - AmdSmiRetCode.STATUS_NO_DATA: "AMDSMI_STATUS_NO_DATA - No data was found for given input", - AmdSmiRetCode.STATUS_UNEXPECTED_DATA: "AMDSMI_STATUS_UNEXPECTED_DATA - The data read or provided was unexpected", - AmdSmiRetCode.STATUS_REFCOUNT_OVERFLOW: 
"AMDSMI_STATUS_REFCOUNT_OVERFLOW - Internal reference counter exceeded INT32_MAX", - AmdSmiRetCode.STATUS_FAIL_LOAD_MODULE: "AMDSMI_STATUS_FAIL_LOAD_MODULE - Fail to load lib", - AmdSmiRetCode.STATUS_FAIL_LOAD_SYMBOL: "AMDSMI_STATUS_FAIL_LOAD_SYMBOL - Fail to load symbol", - AmdSmiRetCode.STATUS_DRM_ERROR: "AMDSMI_STATUS_DRM_ERROR - Error when called libdrm", - AmdSmiRetCode.STATUS_API_FAILED: "AMDSMI_STATUS_API_FAILED - API call failed", - AmdSmiRetCode.STATUS_TIMEOUT: "AMDSMI_STATUS_TIMEOUT - Timeout in API call", - AmdSmiRetCode.STATUS_NO_SLOT: "AMDSMI_STATUS_NO_SLOT - No more free slot", - AmdSmiRetCode.STATUS_RETRY: "AMDSMI_STATUS_RETRY - Retry operation", - AmdSmiRetCode.STATUS_NOT_INIT: "AMDSMI_STATUS_NOT_INIT - Device not initialized", + amdsmi_wrapper.AMDSMI_STATUS_INVAL : "AMDSMI_STATUS_INVAL - Invalid parameters", + amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED : "AMDSMI_STATUS_NOT_SUPPORTED - Feature not supported", + amdsmi_wrapper.AMDSMI_STATUS_NOT_YET_IMPLEMENTED : "AMDSMI_STATUS_NOT_YET_IMPLEMENTED - Feature not yet implemented", + amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_MODULE : "AMDSMI_STATUS_FAIL_LOAD_MODULE - Fail to load lib", + amdsmi_wrapper.AMDSMI_STATUS_FAIL_LOAD_SYMBOL : "AMDSMI_STATUS_FAIL_LOAD_SYMBOL - Fail to load symbol", + amdsmi_wrapper.AMDSMI_STATUS_DRM_ERROR : "AMDSMI_STATUS_DRM_ERROR - Error when called libdrm", + amdsmi_wrapper.AMDSMI_STATUS_API_FAILED : "AMDSMI_STATUS_API_FAILED - API call failed", + amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT : "AMDSMI_STATUS_TIMEOUT - Timeout in API call", + amdsmi_wrapper.AMDSMI_STATUS_RETRY : "AMDSMI_STATUS_RETRY - Retry operation", + amdsmi_wrapper.AMDSMI_STATUS_NO_PERM : "AMDSMI_STATUS_NO_PERM - Permission Denied", + amdsmi_wrapper.AMDSMI_STATUS_INTERRUPT : "AMDSMI_STATUS_INTERRUPT - Interrupt ocurred during execution", + amdsmi_wrapper.AMDSMI_STATUS_IO : "AMDSMI_STATUS_IO - I/O Error", + amdsmi_wrapper.AMDSMI_STATUS_ADDRESS_FAULT : "AMDSMI_STATUS_ADDRESS_FAULT - Bad address", + 
amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR : "AMDSMI_STATUS_FILE_ERROR - Error opening file", + amdsmi_wrapper.AMDSMI_STATUS_OUT_OF_RESOURCES : "AMDSMI_STATUS_OUT_OF_RESOURCES - Not enough memory", + amdsmi_wrapper.AMDSMI_STATUS_INTERNAL_EXCEPTION : "AMDSMI_STATUS_INTERNAL_EXCEPTION - Internal error", + amdsmi_wrapper.AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS : "AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS - Out of bounds", + amdsmi_wrapper.AMDSMI_STATUS_INIT_ERROR : "AMDSMI_STATUS_INIT_ERROR - Initialization error", + amdsmi_wrapper.AMDSMI_STATUS_REFCOUNT_OVERFLOW : "AMDSMI_STATUS_REFCOUNT_OVERFLOW - Internal reference counter exceeded INT32_MAX", + amdsmi_wrapper.AMDSMI_STATUS_BUSY : "AMDSMI_STATUS_BUSY - Device busy", + amdsmi_wrapper.AMDSMI_STATUS_NOT_FOUND : "AMDSMI_STATUS_NOT_FOUND - Device Not found", + amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT : "AMDSMI_STATUS_NOT_INIT - Device not initialized", + amdsmi_wrapper.AMDSMI_STATUS_NO_SLOT : "AMDSMI_STATUS_NO_SLOT - No more free slot", + amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED : "AMDSMI_STATUS_DRIVER_NOT_LOADED - Processor driver not loaded", + amdsmi_wrapper.AMDSMI_STATUS_NO_DATA : "AMDSMI_STATUS_NO_DATA - No data was found for given input", + amdsmi_wrapper.AMDSMI_STATUS_INSUFFICIENT_SIZE : "AMDSMI_STATUS_INSUFFICIENT_SIZE - Insufficient size for operation", + amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_SIZE : "AMDSMI_STATUS_UNEXPECTED_SIZE - unexpected size of data was read", + amdsmi_wrapper.AMDSMI_STATUS_UNEXPECTED_DATA : "AMDSMI_STATUS_UNEXPECTED_DATA - The data read or provided was unexpected", + amdsmi_wrapper.AMDSMI_STATUS_NON_AMD_CPU : "AMDSMI_STATUS_NON_AMD_CPU - System has non-AMD CPU", + amdsmi_wrapper.AMDSMI_NO_ENERGY_DRV : "AMD_SMI_NO_ENERGY_DRV - Energy driver not found", + amdsmi_wrapper.AMDSMI_NO_MSR_DRV : "AMDSMI_NO_MSR_DRV - MSR driver not found", + amdsmi_wrapper.AMDSMI_NO_HSMP_DRV : "AMD_SMI_NO_HSMP_DRV - HSMP driver not found", + amdsmi_wrapper.AMDSMI_NO_HSMP_SUP : "AMD_SMI_NO_HSMP_SUP - HSMP not supported", + 
amdsmi_wrapper.AMDSMI_NO_HSMP_MSG_SUP : "AMD_SMI_NO_HSMP_MSG_SUP - HSMP message/feature not supported", + amdsmi_wrapper.AMDSMI_HSMP_TIMEOUT : "AMD_SMI_HSMP_TIMEOUT - HSMP message timeout", + amdsmi_wrapper.AMDSMI_NO_DRV : "AMDSMI_NO_DRV - No Energy and HSMP driver present", + amdsmi_wrapper.AMDSMI_FILE_NOT_FOUND : "AMDSMI_FILE_NOT_FOUND - File or directory not found", + amdsmi_wrapper.AMDSMI_ARG_PTR_NULL : "AMDSMI_ARG_PTR_NULL - Parsed argument is invalid", + amdsmi_wrapper.AMDSMI_STATUS_MAP_ERROR : "AMDSMI_STATUS_MAP_ERROR - The internal library error did not map to a status code", + amdsmi_wrapper.AMDSMI_STATUS_UNKNOWN_ERROR : "AMDSMI_STATUS_UNKNOWN_ERROR - An unknown error occurred" } self.err_info = switch.get(self.err_code, "AMDSMI_STATUS_UNKNOWN_ERROR - An unknown error occurred") @@ -112,12 +96,12 @@ class AmdSmiLibraryException(AmdSmiException): class AmdSmiRetryException(AmdSmiLibraryException): def __init__(self): - super().__init__(AmdSmiRetCode.RETRY) + super().__init__(amdsmi_wrapper.AMDSMI_STATUS_RETRY) class AmdSmiTimeoutException(AmdSmiLibraryException): def __init__(self): - super().__init__(AmdSmiRetCode.TIMEOUT) + super().__init__(amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT) class AmdSmiParameterException(AmdSmiException):