diff --git a/projects/amdsmi/amdsmi_cli/CMakeLists.txt b/projects/amdsmi/amdsmi_cli/CMakeLists.txt index 02ebc213f7..3068f7a75c 100644 --- a/projects/amdsmi/amdsmi_cli/CMakeLists.txt +++ b/projects/amdsmi/amdsmi_cli/CMakeLists.txt @@ -19,6 +19,7 @@ add_custom_command( ${PY_PACKAGE_DIR}/amdsmi_init.py ${PY_PACKAGE_DIR}/amdsmi_logger.py ${PY_PACKAGE_DIR}/amdsmi_parser.py + ${PY_PACKAGE_DIR}/amdsmi_cli_exceptions.py ${PY_PACKAGE_DIR}/BDF.py DEPENDS amdsmi_cli COMMAND mkdir -p ${PY_PACKAGE_DIR}/ @@ -30,6 +31,7 @@ add_custom_command( COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_init.py ${PY_PACKAGE_DIR}/ COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_logger.py ${PY_PACKAGE_DIR}/ COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_parser.py ${PY_PACKAGE_DIR}/ + COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_cli_exceptions.py ${PY_PACKAGE_DIR}/ COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/BDF.py ${PY_PACKAGE_DIR}/) # The CLI requires the python amdsmi wrapper to be installed @@ -44,6 +46,7 @@ add_custom_target( ${PY_PACKAGE_DIR}/amdsmi_init.py ${PY_PACKAGE_DIR}/amdsmi_logger.py ${PY_PACKAGE_DIR}/amdsmi_parser.py + ${PY_PACKAGE_DIR}/amdsmi_cli_exceptions.py ${PY_PACKAGE_DIR}/BDF.py) install( diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py index 6e77bbaae3..83661c7dc3 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py @@ -26,6 +26,20 @@ import sys from amdsmi_commands import AMDSMICommands from amdsmi_parser import AMDSMIParser from amdsmi_logger import AMDSMILogger +import amdsmi_cli_exceptions +from amdsmi import amdsmi_interface + +def _print_error(e, destination): + if destination == 'stdout': + print(e) + else: + f = open(destination, "w") + f.write(e) + f.close() + print("Error occured. 
Result written to " + + str(destination) + " file") + + if __name__ == "__main__": # Set compatability mode based on which cli mapping user selects @@ -50,23 +64,28 @@ if __name__ == "__main__": amd_smi_commands.set_value, amd_smi_commands.reset, amd_smi_commands.rocm_smi) + try: + args = amd_smi_parser.parse_args(args=None if sys.argv[1:] else ['--help']) - args = amd_smi_parser.parse_args(args=None if sys.argv[1:] else ['--help']) + # Handle command modifiers before subcommand execution + if args.json: + amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.json.value + if args.csv: + amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.csv.value + if args.file: + amd_smi_commands.logger.destination = args.file + if args.loglevel: + logging_dict = {'DEBUG' : logging.DEBUG, + 'INFO' : logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL} + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel]) - # Handle command modifiers before subcommand execution - if args.json: - amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.json.value - if args.csv: - amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.csv.value - if args.file: - amd_smi_commands.logger.destination = args.file - if args.loglevel: - logging_dict = {'DEBUG' : logging.DEBUG, - 'INFO' : logging.INFO, - 'WARNING': logging.WARNING, - 'ERROR': logging.ERROR, - 'CRITICAL': logging.CRITICAL} - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel]) - - # Execute subcommands - args.func(args) + # Execute subcommands + args.func(args) + except amdsmi_cli_exceptions.AmdSmiException as e: + _print_error(str(e), amd_smi_commands.logger.destination) + except amdsmi_interface.AmdSmiLibraryException as e: + exc = amdsmi_cli_exceptions.AmdSmiAMDSMIErrorException(amd_smi_commands.logger.format, e.get_error_code()) + 
"""Exception types raised by the AMD SMI command line interface.

Every exception renders its own user-facing text at construction time, in
the output format that is passed to the constructor ("json", "csv", or
anything else for plain human-readable text). The rendered text is stored
in ``message`` and returned by ``str()``.
"""

import csv
import io
import json
import sys

# Text for each AMDSMI library status code, keyed by the absolute value of
# the code.
AMDSMI_ERROR_MESSAGES = {
    0: "Success",
    1: "Invalid parameters",
    2: "Command not supported",
    3: "Command not yet implemented",
    4: "Failed load module",
    5: "Failed load symbol",
    6: "Drm error",
    7: "API call failed",
    8: "Timeout in API call",
    9: "Retry operation",
    10: "Permission Denied",
    11: "Interrupt occurred during execution",
    12: "I/O Error",
    13: "Address fault",
    14: "Error opening file",
    15: "Not enough memory",
    16: "Internal error",
    17: "Out of bounds",
    18: "Initialization error",
    19: "Internal reference counter exceeded",

    30: "Device busy",
    31: "Device Not found",
    32: "Device not initialized",
    33: "No more free slot",

    40: "No data was found for given input",
    41: "Insufficient size for operation",
    42: "Unexpected size of data was read",
    43: "The data read or provided was unexpected",
}


def _get_error_message(error_code):
    """Return the text for a library status code, or "Generic error".

    The sign of ``error_code`` is ignored.
    """
    return AMDSMI_ERROR_MESSAGES.get(abs(error_code), "Generic error")


def _render_error(text, code, outputformat):
    """Render an error description and its numeric code for one output format.

    Args:
        text (str): Description of the error.
        code (int): CLI error code reported next to the description.
        outputformat (str): "json" or "csv"; any other value selects the
            plain human-readable layout.

    Returns:
        str: The rendered message, without a trailing newline.
    """
    if outputformat == "json":
        return json.dumps({"error": text, "code": code})
    if outputformat == "csv":
        # csv.writer quotes the description when it contains a comma, quote
        # or newline (e.g. a user supplied path), keeping the row well formed.
        row = io.StringIO()
        csv.writer(row, lineterminator="\n").writerow([text, code])
        return "error,code\n" + row.getvalue()[:-1]
    return "{} Error code: {}".format(text, code)


class AmdSmiException(Exception):
    """Base class of every error reported by the CLI."""

    def __str__(self):
        # Subclasses store their rendered text in ``message``; fall back to
        # the default exception text when a subclass did not set it.
        message = getattr(self, "message", None)
        if message is None:
            return super().__str__()
        return message


class _AmdSmiCommandError(AmdSmiException):
    """Shared constructor for errors described by a code and a template.

    Subclasses provide ``_code`` (stored as ``value``) and ``_template``,
    a ``str.format`` template that receives the offending input.
    """

    _code = 0
    _template = ""

    def __init__(self, command, outputformat):
        # Keeps ``args`` equal to the constructor arguments.
        super().__init__(command, outputformat)
        self.value = self._code
        self.command = command
        self.message = _render_error(self._template.format(command),
                                     self.value, outputformat)


class AmdSmiInvalidCommandException(_AmdSmiCommandError):
    """The requested command does not exist (code -1)."""

    _code = -1
    _template = "Command '{}' is invalid. Run '--help' for more info."


class AmdSmiInvalidParameterException(_AmdSmiCommandError):
    """The given parameter is not valid (code -2)."""

    _code = -2
    _template = "Parameter '{}' is invalid. Run '--help' for more info."


class AmdSmiDeviceNotFoundException(_AmdSmiCommandError):
    """No GPU with the given index exists on the system (code -3)."""

    _code = -3
    _template = "GPU Device with GPU_INDEX '{}' cannot be found on the system."


class AmdSmiInvalidFilePathException(_AmdSmiCommandError):
    """The given path cannot be found (code -4)."""

    _code = -4
    _template = "Path '{}' cannot be found."


class AmdSmiInvalidParameterValueException(_AmdSmiCommandError):
    """The given value has the wrong type or format (code -5)."""

    _code = -5
    _template = "Value '{}' is not of valid type or format. Run '--help' for more info."


class AmdSmiMissingParameterValueException(_AmdSmiCommandError):
    """A parameter that needs a value was given none (code -6)."""

    _code = -6
    _template = "Parameter '{}' requires a value. Run '--help' for more info."


class AmdSmiParameterNotSupportedException(_AmdSmiCommandError):
    """The parameter is not supported on this system (code -8)."""

    _code = -8
    _template = "Parameter '{}' is not supported on the system. Run '--help' for more info."


class AmdSmiUnknownErrorException(_AmdSmiCommandError):
    """Any failure without a more specific type (code -100).

    ``command`` is stored but does not appear in the message.
    """

    _code = -100
    _template = "An unknown error has occurred. Run 'help' for more info."


class AmdSmiAMDSMIErrorException(AmdSmiException):
    """An error status returned by the AMDSMI library.

    Args:
        outputformat (str): "json", "csv" or anything else for plain text.
        error_code (int): Status code from the library; its sign is ignored.

    Attributes:
        value: CLI error code, ``-1000 - abs(error_code)``.
        smilibcode: The library status code as passed in.
        message: The rendered error text.
    """

    def __init__(self, outputformat, error_code):
        # Keeps ``args`` equal to the constructor arguments.
        super().__init__(outputformat, error_code)
        self.value = -1000 - abs(error_code)
        self.smilibcode = error_code

        # Every format goes through _get_error_message so that a status code
        # missing from the table yields "Generic error" instead of a KeyError.
        text = "AMDSMI has returned error '{}' - '{}'".format(
            self.value, _get_error_message(self.smilibcode))
        self.message = _render_error(text, self.value, outputformat)
asic_info['family'] = hex(asic_info['family']) + asic_info['vendor_id'] = hex(asic_info['vendor_id']) + asic_info['device_id'] = hex(asic_info['device_id']) + asic_info['rev_id'] = hex(asic_info['rev_id']) + if asic_info['asic_serial'] != '': + asic_info['asic_serial'] = '0x' + asic_info['asic_serial'] + + values_dict['asic'] = asic_info except amdsmi_exception.AmdSmiLibraryException as e: - asic_info = e.get_error_info() - - asic_info['family'] = hex(asic_info['family']) - asic_info['vendor_id'] = hex(asic_info['vendor_id']) - asic_info['device_id'] = hex(asic_info['device_id']) - asic_info['rev_id'] = hex(asic_info['rev_id']) - if asic_info['asic_serial'] != '': - asic_info['asic_serial'] = '0x' + asic_info['asic_serial'] - - values_dict['asic'] = asic_info + values_dict['asic'] = e.get_error_info() + if not self.all_arguments: + raise e if args.bus: + bus_output_info = {} + try: bus_info = amdsmi_interface.amdsmi_get_pcie_link_caps(args.gpu) + + if self.logger.is_human_readable_format(): + unit ='MT/s' + bus_info['pcie_speed'] = f"{bus_info['pcie_speed']} {unit}" except amdsmi_exception.AmdSmiLibraryException as e: bus_info = e.get_error_info() - - if self.logger.is_human_readable_format(): - unit ='MT/s' - bus_info['pcie_speed'] = f"{bus_info['pcie_speed']} {unit}" - - bus_output_info = {} + if not self.all_arguments: + raise e try: bus_output_info['bdf'] = amdsmi_interface.amdsmi_get_device_bdf(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: bus_output_info['bdf'] = e.get_error_info() + if not self.all_arguments: + raise e bus_output_info.update(bus_info) - values_dict['bus'] = bus_output_info + if args.vbios: try: vbios_info = amdsmi_interface.amdsmi_get_vbios_info(args.gpu) + if self.logger.is_gpuvsmi_compatibility(): + vbios_info['version'] = vbios_info.pop('vbios_version_string') + vbios_info['build_date'] = vbios_info.pop('build_date') + vbios_info['part_number'] = vbios_info.pop('part_number') + vbios_info['vbios_version'] = 
vbios_info.pop('vbios_version') + + values_dict['vbios'] = vbios_info except amdsmi_exception.AmdSmiLibraryException as e: - vbios_info = e.get_error_info() - - - if self.logger.is_gpuvsmi_compatibility(): - vbios_info['version'] = vbios_info.pop('vbios_version_string') - vbios_info['build_date'] = vbios_info.pop('build_date') - vbios_info['part_number'] = vbios_info.pop('part_number') - vbios_info['vbios_version'] = vbios_info.pop('vbios_version') - - values_dict['vbios'] = vbios_info + values_dict['vbios'] = e.get_error_info() + if not self.all_arguments: + raise e if args.board: try: board_info = amdsmi_interface.amdsmi_get_board_info(args.gpu) + board_info['serial_number'] = hex(board_info['serial_number']) + board_info['product_serial'] = '0x' + board_info['product_serial'] + + if self.logger.is_gpuvsmi_compatibility(): + board_info['product_number'] = board_info.pop('product_serial') + board_info['product_name'] = board_info.pop('product_name') + + values_dict['board'] = board_info except amdsmi_exception.AmdSmiLibraryException as e: - board_info = e.get_error_info() - - board_info['serial_number'] = hex(board_info['serial_number']) - board_info['product_serial'] = '0x' + board_info['product_serial'] - board_info['product_name'] = board_info['product_name'].strip() - - if self.logger.is_gpuvsmi_compatibility(): - board_info['product_number'] = board_info.pop('product_serial') - - values_dict['board'] = board_info + values_dict['board'] = e.get_error_info() + if not self.all_arguments: + raise e if args.limit: try: power_limit = amdsmi_interface.amdsmi_get_power_measure(args.gpu)['power_limit'] except amdsmi_exception.AmdSmiLibraryException as e: power_limit = e.get_error_info() - + if not self.all_arguments: + raise e try: temp_edge_limit = amdsmi_interface.amdsmi_dev_get_temp_metric(args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) except amdsmi_exception.AmdSmiLibraryException as e: temp_edge_limit = 
e.get_error_info() + if not self.all_arguments: + raise e try: temp_junction_limit = amdsmi_interface.amdsmi_dev_get_temp_metric(args.gpu, amdsmi_interface.AmdSmiTemperatureType.JUNCTION, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) except amdsmi_exception.AmdSmiLibraryException as e: temp_junction_limit = e.get_error_info() + if not self.all_arguments: + raise e try: temp_vram_limit = amdsmi_interface.amdsmi_dev_get_temp_metric(args.gpu, amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) except amdsmi_exception.AmdSmiLibraryException as e: - temp_vram_limit = e.get_error_info() + temp_junction_limit = e.get_error_info() + if not self.all_arguments: + raise e if self.logger.is_human_readable_format(): unit = 'W' @@ -309,35 +323,40 @@ class AMDSMICommands(): values_dict['limit'] = limit_info if args.driver: - driver_info = {} try: + driver_info = {} driver_info['driver_version'] = amdsmi_interface.amdsmi_get_driver_version(args.gpu) - except amdsmi_exception.AmdSmiLibraryException as e: - driver_info['driver_version'] = e.get_error_info() - values_dict['driver'] = driver_info + values_dict['driver'] = driver_info + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['driver'] = e.get_error_info() + if not self.all_arguments: + raise e if args.ras: try: values_dict['ras'] = amdsmi_interface.amdsmi_get_ras_block_features_enabled(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: values_dict['ras'] = e.get_error_info() - + if not self.all_arguments: + raise e if args.caps: try: caps_info = amdsmi_interface.amdsmi_get_caps_info(args.gpu) + + if self.logger.is_gpuvsmi_compatibility(): + del caps_info['ras_supported'] + caps_info['gfx'] = caps_info.pop('gfx') + + if self.logger.is_human_readable_format(): + for capability_name, capability_value in caps_info.items(): + if isinstance(capability_value, list): + caps_info[capability_name] = f"{capability_value}" + + values_dict['caps'] = caps_info 
except amdsmi_exception.AmdSmiLibraryException as e: - caps_info = e.get_error_info() - - if self.logger.is_gpuvsmi_compatibility(): - del caps_info['ras_supported'] - caps_info['gfx'] = caps_info.pop('gfx') - - if self.logger.is_human_readable_format(): - for capability_name, capability_value in caps_info.items(): - if isinstance(capability_value, list): - caps_info[capability_name] = f"{capability_value}" - - values_dict['caps'] = caps_info + values_dict['caps'] = e.get_error_info() + if not self.all_arguments: + raise e # Store values in logger.output self.logger.store_output(args.gpu, 'values', values_dict) @@ -384,30 +403,30 @@ class AMDSMICommands(): if args.fw_list: try: fw_info = amdsmi_interface.amdsmi_get_fw_info(args.gpu) - except amdsmi_exception.AmdSmiLibraryException as e: - fw_info = e.get_error_info() - for fw_index, fw_entry in enumerate(fw_info['fw_list']): - # Change fw_name to fw_id - fw_entry['fw_id'] = fw_entry.pop('fw_name').name.strip('FW_ID_') - fw_entry['fw_version'] = fw_entry.pop('fw_version') - firmware_identifier = 'FW' + for fw_index, fw_entry in enumerate(fw_info['fw_list']): + # Change fw_name to fw_id + fw_entry['fw_id'] = fw_entry.pop('fw_name').name.strip('FW_ID_') + fw_entry['fw_version'] = fw_entry.pop('fw_version') + firmware_identifier = 'FW' + + if self.logger.is_gpuvsmi_compatibility(): + firmware_identifier = 'UCODE' + fw_entry['name'] = fw_entry.pop('fw_id') + fw_entry['version'] = fw_entry.pop('fw_version') + + # Add custom human readable formatting + if self.logger.is_human_readable_format(): + fw_info['fw_list'][fw_index] = {f'{firmware_identifier} {fw_index}': fw_entry} + else: + fw_info['fw_list'][fw_index] = fw_entry if self.logger.is_gpuvsmi_compatibility(): - firmware_identifier = 'UCODE' - fw_entry['name'] = fw_entry.pop('fw_id') - fw_entry['version'] = fw_entry.pop('fw_version') + fw_info['ucode_list'] = fw_info.pop('fw_list') - # Add custom human readable formatting - if self.logger.is_human_readable_format(): 
- fw_info['fw_list'][fw_index] = {f'{firmware_identifier} {fw_index}': fw_entry} - else: - fw_info['fw_list'][fw_index] = fw_entry - - if self.logger.is_gpuvsmi_compatibility(): - fw_info['ucode_list'] = fw_info.pop('fw_list') - - values_dict.update(fw_info) + values_dict.update(fw_info) + except amdsmi_exception.AmdSmiLibraryException as e: + raise e # Store values in logger.output self.logger.store_output(args.gpu, 'values', values_dict) @@ -471,6 +490,7 @@ class AMDSMICommands(): bad_page_info = "" bad_page_err_output = e.get_error_info() bad_page_error = True + raise e if isinstance(bad_page_info, str): pass @@ -582,6 +602,7 @@ class AMDSMICommands(): Returns: None: Print output via AMDSMILogger to destination """ + # Set args.* to passed in arguments if gpu: args.gpu = gpu @@ -655,158 +676,144 @@ class AMDSMICommands(): else: raise IndexError("args.gpu should not be an empty list") + # Check if any of the options have been set, if not then set them all to true if not any([args.usage, args.fb_usage, args.power, args.clock, args.temperature, args.ecc, args.pcie, args.voltage, args.fan, args.pcie_usage, args.voltage_curve, args.overdrive, args.mem_overdrive, args.perf_level, args.replay_count, args.xgmi_err, args.energy, args.mem_usage]): args.usage = args.fb_usage = args.power = args.clock = args.temperature = args.ecc = args.pcie = args.voltage = args.fan = \ args.pcie_usage = args.voltage_curve = args.overdrive = args.mem_overdrive = args.perf_level = \ - args.replay_count = args.xgmi_err = args.energy = args.mem_usage = True + args.replay_count = args.xgmi_err = args.energy = args.mem_usage = self.all_arguments = True + # Add timestamp and store values for specified arguments values_dict = {} if args.usage: try: engine_usage = amdsmi_interface.amdsmi_get_gpu_activity(args.gpu) + + if self.logger.is_gpuvsmi_compatibility(): + engine_usage['gfx_usage'] = engine_usage.pop('gfx_activity') + engine_usage['mem_usage'] = engine_usage.pop('umc_activity') + 
engine_usage['mm_usage_list'] = engine_usage.pop('mm_activity') + + if self.logger.is_human_readable_format(): + unit = '%' + for usage_name, usage_value in engine_usage.items(): + engine_usage[usage_name] = f"{usage_value} {unit}" + + values_dict['usage'] = engine_usage except amdsmi_exception.AmdSmiLibraryException as e: - engine_usage = e.get_error_info() - - if self.logger.is_gpuvsmi_compatibility(): - engine_usage['gfx_usage'] = engine_usage.pop('gfx_activity') - engine_usage['mem_usage'] = engine_usage.pop('umc_activity') - engine_usage['mm_usage_list'] = engine_usage.pop('mm_activity') - - if self.logger.is_human_readable_format(): - unit = '%' - for usage_name, usage_value in engine_usage.items(): - engine_usage[usage_name] = f"{usage_value} {unit}" - - values_dict['usage'] = engine_usage + raise e if args.fb_usage: try: vram_usage = amdsmi_interface.amdsmi_get_vram_usage(args.gpu) + + if self.logger.is_gpuvsmi_compatibility(): + vram_usage['fb_total'] = vram_usage.pop('vram_total') + vram_usage['fb_used'] = vram_usage.pop('vram_used') + + if self.logger.is_human_readable_format(): + unit = 'MB' + for vram_name, vram_value in vram_usage.items(): + vram_usage[vram_name] = f"{vram_value} {unit}" + + values_dict['fb_usage'] = vram_usage except amdsmi_exception.AmdSmiLibraryException as e: - vram_usage = e.get_error_info() - - if self.logger.is_gpuvsmi_compatibility(): - vram_usage['fb_total'] = vram_usage.pop('vram_total') - vram_usage['fb_used'] = vram_usage.pop('vram_used') - - if self.logger.is_human_readable_format(): - unit = 'MB' - for vram_name, vram_value in vram_usage.items(): - vram_usage[vram_name] = f"{vram_value} {unit}" - - values_dict['fb_usage'] = vram_usage + raise e if args.power: try: average_socket_power = amdsmi_interface.amdsmi_get_power_measure(args.gpu)['average_socket_power'] + + if self.logger.is_gpuvsmi_compatibility(): + pass + + if self.logger.is_human_readable_format(): + unit = 'W' + average_socket_power = 
f"{average_socket_power} {unit}" + + values_dict['power'] = average_socket_power except amdsmi_exception.AmdSmiLibraryException as e: - average_socket_power = e.get_error_info() - - if self.logger.is_gpuvsmi_compatibility(): - pass - - if self.logger.is_human_readable_format(): - unit = 'W' - average_socket_power = f"{average_socket_power} {unit}" - - values_dict['power'] = average_socket_power + raise e if args.clock: try: clock_gfx = amdsmi_interface.amdsmi_get_clock_measure(args.gpu, amdsmi_interface.AmdSmiClkType.GFX) - except amdsmi_exception.AmdSmiLibraryException as e: - clock_gfx = e.get_error_info() - - try: clock_mem = amdsmi_interface.amdsmi_get_clock_measure(args.gpu, amdsmi_interface.AmdSmiClkType.MEM) + + clocks = {'gfx': clock_gfx, + 'mem': clock_mem} + + if self.logger.is_human_readable_format(): + unit = 'MHz' + for clock_target, clock_metric_values in clocks.items(): + for clock_type, clock_value in clock_metric_values.items(): + clocks[clock_target][clock_type] = f"{clock_value} {unit}" + + values_dict['clock'] = clocks except amdsmi_exception.AmdSmiLibraryException as e: - clock_mem = e.get_error_info() - - clocks = {'gfx': clock_gfx, - 'mem': clock_mem} - - if self.logger.is_human_readable_format(): - unit = 'MHz' - for clock_target, clock_metric_values in clocks.items(): - for clock_type, clock_value in clock_metric_values.items(): - clocks[clock_target][clock_type] = f"{clock_value} {unit}" - - values_dict['clock'] = clocks + raise e if args.temperature: try: temperature_edge_current = amdsmi_interface.amdsmi_dev_get_temp_metric( args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) - except amdsmi_exception.AmdSmiLibraryException as e: - temperature_edge_current = e.get_error_info() - - try: temperature_junction_current = amdsmi_interface.amdsmi_dev_get_temp_metric( args.gpu, amdsmi_interface.AmdSmiTemperatureType.JUNCTION, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) - except 
amdsmi_exception.AmdSmiLibraryException as e: - temperature_junction_current = e.get_error_info() - - try: temperature_vram_current = amdsmi_interface.amdsmi_dev_get_temp_metric( args.gpu, amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + + temperatures = { 'edge': temperature_edge_current, + 'hotspot': temperature_junction_current, + 'mem': temperature_vram_current} + + if self.logger.is_gpuvsmi_compatibility(): + temperatures = { 'edge_temperature': temperature_edge_current, + 'hotspot_temperature': temperature_junction_current, + 'mem_temperature': temperature_vram_current} + + if self.logger.is_human_readable_format(): + unit = '\N{DEGREE SIGN}C' + for temperature_value in temperatures: + temperatures[temperature_value] = f"{temperatures[temperature_value]} {unit}" + + values_dict['temperature'] = temperatures except amdsmi_exception.AmdSmiLibraryException as e: - temperature_vram_current = e.get_error_info() - - temperatures = { 'edge': temperature_edge_current, - 'hotspot': temperature_junction_current, - 'mem': temperature_vram_current} - - if self.logger.is_gpuvsmi_compatibility(): - temperatures = { 'edge_temperature': temperature_edge_current, - 'hotspot_temperature': temperature_junction_current, - 'mem_temperature': temperature_vram_current} - - if self.logger.is_human_readable_format(): - unit = '\N{DEGREE SIGN}C' - for temperature_value in temperatures: - temperatures[temperature_value] = f"{temperatures[temperature_value]} {unit}" - - values_dict['temperature'] = temperatures + raise e if args.ecc: try: values_dict['ecc'] = amdsmi_interface.amdsmi_get_ecc_error_count(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: values_dict['ecc'] = e.get_error_info() - + raise e if args.pcie: try: pcie_link_status = amdsmi_interface.amdsmi_get_pcie_link_caps(args.gpu) + + if self.logger.is_human_readable_format(): + unit ='MT/s' + pcie_link_status['pcie_speed'] = f"{pcie_link_status['pcie_speed']} 
{unit}" + + if self.logger.is_gpuvsmi_compatibility(): + pcie_link_status['current_width'] = pcie_link_status.pop('pcie_lanes') + pcie_link_status['current_speed'] = pcie_link_status.pop('pcie_speed') + + values_dict['pcie'] = pcie_link_status except amdsmi_exception.AmdSmiLibraryException as e: - pcie_link_status = e.get_error_info() - - if self.logger.is_human_readable_format(): - unit ='MT/s' - pcie_link_status['pcie_speed'] = f"{pcie_link_status['pcie_speed']} {unit}" - - if self.logger.is_gpuvsmi_compatibility(): - pcie_link_status['current_width'] = pcie_link_status.pop('pcie_lanes') - pcie_link_status['current_speed'] = pcie_link_status.pop('pcie_speed') - - values_dict['pcie'] = pcie_link_status + raise e if args.voltage: try: volt_metric = amdsmi_interface.amdsmi_dev_get_volt_metric( args.gpu, amdsmi_interface.AmdSmiVoltageType.VDDGFX, amdsmi_interface.AmdSmiVoltageMetric.CURRENT) + + if self.logger.is_human_readable_format(): + unit = 'mV' + volt_metric = f"{volt_metric} {unit}" + + values_dict['voltage'] = volt_metric except amdsmi_exception.AmdSmiLibraryException as e: - volt_metric = e.get_error_info() - - if self.logger.is_human_readable_format(): - unit = 'mV' - volt_metric = f"{volt_metric} {unit}" - - values_dict['voltage'] = volt_metric + raise e if args.fan: try: fan_speed = amdsmi_interface.amdsmi_dev_get_fan_speed(args.gpu, 0) - except amdsmi_exception.AmdSmiLibraryException as e: - fan_speed = e.get_error_info() - - try: fan_max = amdsmi_interface.amdsmi_dev_get_fan_speed_max(args.gpu, 0) if isinstance(fan_speed, int) and fan_max > 0: fan_percent = round((float(fan_speed) / float(fan_max)) * 100, 2) @@ -815,42 +822,30 @@ class AMDSMICommands(): fan_percent = f"{fan_percent} {unit}" else: fan_percent = 'Unable to detect fan speed' - except amdsmi_exception.AmdSmiLibraryException as e: - fan_max = e.get_error_info() - fan_percent = 'Unable to detect fan speed' - try: fan_rpm = amdsmi_interface.amdsmi_dev_get_fan_rpms(args.gpu, 0) - except 
amdsmi_exception.AmdSmiLibraryException as e: - fan_rpm = e.get_error_info() - values_dict['fan'] = {'speed': fan_speed, - 'max' : fan_max, - 'rpm' : fan_rpm, - 'usage' : fan_percent} + values_dict['fan'] = {'speed': fan_speed, + 'max' : fan_max, + 'rpm' : fan_rpm, + 'usage' : fan_percent} + except amdsmi_exception.AmdSmiLibraryException as e: + raise e if args.pcie_usage: try: pcie_link_status = amdsmi_interface.amdsmi_get_pcie_link_status(args.gpu) - pcie_link_status_call = True + + if self.logger.is_human_readable_format(): + unit ='MT/s' + pcie_link_status['pcie_speed'] = f"{pcie_link_status['pcie_speed']} {unit}" + + values_dict['pcie_usage'] = pcie_link_status except amdsmi_exception.AmdSmiLibraryException as e: - pcie_link_status = e.get_error_info() - pcie_link_status_call = False - - if self.logger.is_human_readable_format() and pcie_link_status_call: - unit ='MT/s' - pcie_link_status['pcie_speed'] = f"{pcie_link_status['pcie_speed']} {unit}" - - values_dict['pcie_usage'] = pcie_link_status + raise e if args.voltage_curve: try: od_volt = amdsmi_interface.amdsmi_dev_get_od_volt_info(args.gpu) - voltage_curve_error = False - except amdsmi_exception.AmdSmiLibraryException as e: - od_volt = None - values_dict["voltage_curve"] = e.get_error_info() - voltage_curve_error = True - if not voltage_curve_error: voltage_point_dict = {} for point in range(3): @@ -863,100 +858,79 @@ class AMDSMICommands(): voltage_point_dict[f'voltage_point_{point}'] = f"{frequency}Mhz {voltage}mV" values_dict['voltage_curve'] = voltage_point_dict + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['voltage_curve'] = e.get_error_info() + if not self.all_arguments: + raise e + if args.overdrive: try: overdrive_level = amdsmi_interface.amdsmi_dev_get_overdrive_level(args.gpu) + if self.logger.is_human_readable_format(): unit = '%' overdrive_level = f"{overdrive_level} {unit}" - except amdsmi_exception.AmdSmiLibraryException as e: - overdrive_level = e.get_error_info() - 
values_dict['overdrive'] = overdrive_level + values_dict['overdrive'] = overdrive_level + except amdsmi_exception.AmdSmiLibraryException as e: + raise e if args.mem_overdrive: values_dict['mem_overdrive'] = amdsmi_interface.AmdSmiRetCode.NOT_IMPLEMENTED if args.perf_level: try: - values_dict['perf_level'] = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) + perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) + values_dict['perf_level'] = perf_level except amdsmi_exception.AmdSmiLibraryException as e: - values_dict['perf_level'] = e.get_error_info() - + raise e if args.replay_count: try: - values_dict['replay_count'] = amdsmi_interface.amdsmi_dev_get_pci_replay_counter(args.gpu) + pci_replay_counter = amdsmi_interface.amdsmi_dev_get_pci_replay_counter(args.gpu) + values_dict['replay_count'] = pci_replay_counter except amdsmi_exception.AmdSmiLibraryException as e: - values_dict['replay_count'] = e.get_error_info() - + raise e if args.xgmi_err: try: values_dict['xgmi_err'] = amdsmi_interface.amdsmi_dev_xgmi_error_status(args.gpu) - except amdsmi_exception.AmdSmiLibraryException as e: - values_dict['xgmi_err'] = e.get_error_info() - + except amdsmi_interface.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.AmdSmiRetCode.ERR_NOT_SUPPORTED: + values_dict['xgmi_err'] = 'N/A' + else: + raise e if args.energy: try: energy = amdsmi_interface.amdsmi_get_power_measure(args.gpu)['energy_accumulator'] + if self.logger.is_human_readable_format(): unit = 'J' energy = f"{energy} {unit}" - except amdsmi_exception.AmdSmiLibraryException as e: - energy = e.get_error_info() - values_dict['energy'] = energy + values_dict['energy'] = energy + except amdsmi_exception.AmdSmiLibraryException as e: + raise e if args.mem_usage: - unit = 'MB' - memory_total = {} - try: - total_vram = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) - memory_total['vram'] = total_vram // (1024*1024) + memory_total_vram = 
amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) + memory_total_vis_vram = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) + memory_total_gtt = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) + + memory_total = {} + # Convert mem_usage to megabytes + memory_total['vram'] = memory_total_vram // (1024*1024) + memory_total['vis_vram'] = memory_total_vis_vram // (1024*1024) + memory_total['gtt'] = memory_total_gtt // (1024*1024) + if self.logger.is_human_readable_format(): + unit = 'MB' + energy = f"{energy} {unit}" memory_total['vram'] = f"{memory_total['vram']} {unit}" - except amdsmi_exception.AmdSmiLibraryException as e: - memory_total['vram'] = e.get_error_info() - - try: - total_vis_vram = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) - memory_total['vis_vram'] = total_vis_vram // (1024*1024) - if self.logger.is_human_readable_format(): memory_total['vis_vram'] = f"{memory_total['vis_vram']} {unit}" - except amdsmi_exception.AmdSmiLibraryException as e: - memory_total['vis_vram'] = e.get_error_info() - - try: - total_gtt = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) - memory_total['gtt'] = total_gtt // (1024*1024) - if self.logger.is_human_readable_format(): memory_total['gtt'] = f"{memory_total['gtt']} {unit}" - except amdsmi_exception.AmdSmiLibraryException as e: - memory_total['gtt'] = e.get_error_info() - try: - total_used_vram = amdsmi_interface.amdsmi_dev_get_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) - memory_total['used_vram'] = total_used_vram // (1024*1024) - if self.logger.is_human_readable_format(): - memory_total['used_vram'] = f"{memory_total['used_vram']} {unit}" - except amdsmi_exception.AmdSmiLibraryException as e: - memory_total['used_vram'] = e.get_error_info() - try: - 
total_used_vis_vram = amdsmi_interface.amdsmi_dev_get_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) - memory_total['used_vis_vram'] = total_used_vis_vram // (1024*1024) - if self.logger.is_human_readable_format(): - memory_total['used_vis_vram'] = f"{memory_total['used_vis_vram']} {unit}" + values_dict['mem_usage'] = memory_total except amdsmi_exception.AmdSmiLibraryException as e: - memory_total['used_vis_vram'] = e.get_error_info() - - try: - total_used_gtt = amdsmi_interface.amdsmi_dev_get_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) - memory_total['used_gtt'] = total_used_gtt // (1024*1024) - if self.logger.is_human_readable_format(): - memory_total['used_gtt'] = f"{memory_total['used_gtt']} {unit}" - except amdsmi_exception.AmdSmiLibraryException as e: - memory_total['used_gtt'] = e.get_error_info() - - values_dict['mem_usage'] = memory_total + raise e # Store values in logger.output self.logger.store_output(args.gpu, 'values', values_dict) @@ -1190,11 +1164,6 @@ class AMDSMICommands(): args.gpu = self.device_handles # Handle all args being false - - # If all arguments are False, it means that no argument was passed and the entire topology should be printed - # if not any([args.asic, args.bus, args.vbios, args.limit, args.driver, args.caps, args.ras, args.board]): - # args.asic = args.bus = args.vbios = args.limit = args.driver = args.caps = args.ras = args.board = True - if not any([args.access, args.weight, args.hops, args.type, args.numa, args.numa_bw]): args.access = args.weight = args.hops = args.type = args.numa = args.numa_bw = True diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 26ea86070e..5e87cfa2c2 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -26,9 +26,12 @@ import errno import os import time from pathlib import Path +import sys from _version import __version__ from amdsmi_helpers import 
AMDSMIHelpers +import amdsmi_cli_exceptions +from BDF import BDF class AMDSMIParser(argparse.ArgumentParser): @@ -80,8 +83,13 @@ class AMDSMIParser(argparse.ArgumentParser): if int_value.isdigit(): # Is digit works only on positive numbers return int(int_value) else: - raise argparse.ArgumentTypeError( - f"invalid input:{int_value} integer provided must be positive") + args = sys.argv[1:] + outputformat = "human" + if "--json" in args or "--j" in args: + outputformat = "json" + elif "--csv" in args or "-c" in args: + outputformat = "csv" + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat) def _check_output_file_path(self): @@ -99,8 +107,13 @@ class AMDSMIParser(argparse.ArgumentParser): if path.parent.is_dir(): path.touch() else: - raise argparse.ArgumentTypeError( - f"Invalid path:{path} Could not find parent directory of given path") + args = sys.argv[1:] + outputformat = "human" + if "--json" in args or "--j" in args: + outputformat = "json" + elif "--csv" in args or "-c" in args: + outputformat = "csv" + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, outputformat) if path.is_dir(): path = path / f"{int(time.time())}-amdsmi-output.txt" @@ -109,8 +122,13 @@ class AMDSMIParser(argparse.ArgumentParser): elif path.is_file(): setattr(args, self.dest, path) else: - raise argparse.ArgumentTypeError( - f"Invalid path:{path} Could not determine if value given is a valid path") + args = sys.argv[1:] + outputformat = "human" + if "--json" in args or "--j" in args: + outputformat = "json" + elif "--csv" in args or "-c" in args: + outputformat = "csv" + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, outputformat) return CheckOutputFilePath @@ -142,26 +160,24 @@ class AMDSMIParser(argparse.ArgumentParser): def _check_watch_selected(self): - """ Argument action validator: - Validate that the -w/--watch argument was selected + """ Validate that the -w/--watch argument was selected This is because 
-W/--watch_time and -i/--iterations are dependent on watch """ - class _WatchSelectedAction(argparse.Action): + class WatchSelectedAction(argparse.Action): # Checks the values def __call__(self, parser, args, values, option_string=None): if args.watch is None: - raise argparse.ArgumentError(self, - f"Invalid argument: '{self.dest}' needs to be paired with -w/--watch") - setattr(args, self.dest, values) - return _WatchSelectedAction - + raise argparse.ArgumentError(self, f"invalid argument: '{self.dest}' needs to be paired with -w/--watch") + else: + setattr(args, self.dest, values) + return WatchSelectedAction def _gpu_select(self, gpu_choices): - """ Argument action validator: - Custom argparse action to return the device handle(s) for the gpu(s) selected + """ Custom argparse action to return the device handle(s) for the gpu(s) selected This will set the destination (args.gpu) to a list of 1 or more device handles If 1 or more device handles are not found then raise an ArgumentError for the first invalid gpu seen """ + amdsmi_helpers = self.amdsmi_helpers class _GPUSelectAction(argparse.Action): # Checks the values @@ -172,7 +188,17 @@ class AMDSMIParser(argparse.ArgumentParser): setattr(args, self.dest, selected_device_handles) else: invalid_selection = selected_device_handles - raise argparse.ArgumentError(self, f"invalid choice: '{invalid_selection}' (see available choices with -h)") + args = sys.argv[1:] + outputformat = "human" + if "--json" in args or "--j" in args: + outputformat = "json" + elif "--csv" in args or "-c" in args: + outputformat = "csv" + if invalid_selection == '': + raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--gpu", outputformat) + else: + raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(invalid_selection, outputformat) + return _GPUSelectAction @@ -679,6 +705,7 @@ class AMDSMIParser(argparse.ArgumentParser): class _ValidateFanSpeed(argparse.Action): # Checks the values def __call__(self, parser, args, 
values, option_string=None): + # Convert percentage to fan level if isinstance(values, str): try: @@ -804,3 +831,22 @@ class AMDSMIParser(argparse.ArgumentParser): rocm_smi_parser.add_argument('-p', '--showproductname', action='store_true', required=False, help=showproductname_help) rocm_smi_parser.add_argument('-v', '--showclkvolt', action='store_true', required=False, help=showclkvolt_help) rocm_smi_parser.add_argument('-f', '--showclkfrq', action='store_true', required=False, help=showclkfrq_help) + + def error(self, message): + args = sys.argv[1:] + outputformat = "human" + if "--json" in args or "--j" in args: + outputformat = "json" + elif "--csv" in args or "-c" in args: + outputformat = "csv" + if "argument : invalid choice: " in message: + l = len("argument : invalid choice: ") + 1 + message = message[l:] + message = message.split("'")[0] + raise amdsmi_cli_exceptions.AmdSmiInvalidCommandException(message, outputformat) + elif "unrecognized arguments: " in message: + l = len("unrecognized arguments: ") + message = message[l:] + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(message, outputformat) + else: + print(message)