diff --git a/amdsmi_cli/_version.py b/amdsmi_cli/_version.py index a0235ce508..27fdca497c 100644 --- a/amdsmi_cli/_version.py +++ b/amdsmi_cli/_version.py @@ -1 +1 @@ -__version__ = "0.0.2" \ No newline at end of file +__version__ = "0.0.3" diff --git a/amdsmi_cli/amdsmi_cli_exceptions.py b/amdsmi_cli/amdsmi_cli_exceptions.py index d479673656..c43f37379a 100644 --- a/amdsmi_cli/amdsmi_cli_exceptions.py +++ b/amdsmi_cli/amdsmi_cli_exceptions.py @@ -1,6 +1,26 @@ +#!/usr/bin/env python3 +# +# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# import json -import sys AMDSMI_ERROR_MESSAGES = { 0: "Sucess", @@ -23,12 +43,12 @@ AMDSMI_ERROR_MESSAGES = { 17: "Out of bounds", 18: "Initialization error", 19: "Internal reference counter exceeded", - + # Reserved for future error messages 30: "Device busy", 31: "Device Not found", 32: "Device not initialized", 33: "No more free slot", - + # Reserved for future error messages 40: "No data was found for given input", 41: "Insufficient size for operation", 42: "Unexpected size of data was read", @@ -41,142 +61,152 @@ def _get_error_message(error_code): return "Generic error" class AmdSmiException(Exception): + def __init__(self): + self.json_message = {} + self.csv_message = '' + self.stdout_message = '' + self.message = '' + self.output_format = '' + def __str__(self): + # Return message according to the current output format + if self.output_format == 'json': + self.message = json.dumps(self.json_message) + elif self.output_format == 'csv': + self.message = self.csv_message + else: + self.message = self.stdout_message + return self.message class AmdSmiInvalidCommandException(AmdSmiException): def __init__(self, command, outputformat): + super().__init__() self.value = -1 self.command = command - if outputformat == "json": - values = {} - values["error"] = "Command '{}' is invalid. Run '--help' for more info.".format(self.command) - values["code"] = self.value - self.message = json.dumps(values) - elif outputformat == "csv": - self.message = "error,code\n" + "Command '{}' is invalid. Run '--help' for more info.,".format(self.command) + str(self.value) - else: - self.message = "Command '{}' is invalid. Run '--help' for more info. Error code: {}".format(self.command, self.value) + self.output_format = outputformat + common_message = f"Command '{self.command}' is invalid. Run '--help' for more info." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" class AmdSmiInvalidParameterException(AmdSmiException): def __init__(self, command, outputformat): + super().__init__() self.value = -2 self.command = command - if outputformat == "json": - values = {} - values["error"] = "Parameter '{}' is invalid. Run '--help' for more info.".format(self.command) - values["code"] = self.value - self.message = json.dumps(values) - elif outputformat == "csv": - self.message = "error,code\n" + "Parameter '{}' is invalid. Run '--help' for more info.,".format(self.command) + str(self.value) - else: - self.message = "Parameter '{}' is invalid. Run '--help' for more info. Error code: {}".format(self.command, self.value) + self.output_format = outputformat + + common_message = f"Parameter '{self.command}' is invalid. Run '--help' for more info." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" class AmdSmiDeviceNotFoundException(AmdSmiException): def __init__(self, command, outputformat): + super().__init__() self.value = -3 self.command = command - if outputformat == "json": - values = {} - values["error"] = "GPU Device with GPU_INDEX '{}' cannot be found on the system.".format(self.command) - values["code"] = self.value - self.message = json.dumps(values) - elif outputformat == "csv": - self.message = "error,code\n" + "GPU Device with GPU_INDEX '{}' cannot be found on the system.,".format(self.command) + str(self.value) - else: - self.message = "GPU Device with GPU_INDEX '{}' cannot be found on the system. Error code: {}".format(self.command, self.value) + self.output_format = outputformat + common_message = f"GPU Device with GPU_INDEX '{self.command}' cannot be found on the system." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" class AmdSmiInvalidFilePathException(AmdSmiException): def __init__(self, command, outputformat): + super().__init__() self.value = -4 self.command = command - if outputformat == "json": - values = {} - values["error"] = "Path '{}' cannot be found.".format(self.command) - values["code"] = self.value - self.message = json.dumps(values) - elif outputformat == "csv": - self.message = "error,code\n" + "Path '{}' cannot be found.,".format(self.command) + str(self.value) - else: - self.message = "Path '{}' cannot be found. Error code: {}".format(self.command, self.value) + self.output_format = outputformat + + common_message = f"Path '{self.command}' cannot be found." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" class AmdSmiInvalidParameterValueException(AmdSmiException): def __init__(self, command, outputformat): + super().__init__() self.value = -5 self.command = command - if outputformat == "json": - values = {} - values["error"] = "Value '{}' is not of valid type or format. Run '--help' for more info.".format(self.command) - values["code"] = self.value - self.message = json.dumps(values) - elif outputformat == "csv": - self.message = "error,code\n" + "Value '{}' is not of valid type or format. Run '--help' for more info.,".format(self.command) + str(self.value) - else: - self.message = "Value '{}' is not of valid type or format. Run '--help' for more info. Error code: {}".format(self.command, self.value) + self.output_format = outputformat + + common_message = f"Value '{self.command}' is not of valid type or format. Run '--help' for more info." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" class AmdSmiMissingParameterValueException(AmdSmiException): def __init__(self, command, outputformat): + super().__init__() self.value = -6 self.command = command - if outputformat == "json": - values = {} - values["error"] = "Parameter '{}' requires a value. Run '--help' for more info.".format(self.command) - values["code"] = self.value - self.message = json.dumps(values) - elif outputformat == "csv": - self.message = "error,code\n" + "Parameter '{}' requires a value. Run '--help' for more info.,".format(self.command) + str(self.value) - else: - self.message = "Parameter '{}' requires a value. Run '--help' for more info. Error code: {}".format(self.command, self.value) + self.output_format = outputformat + + common_message = f"Parameter '{self.command}' requires a value. Run '--help' for more info." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + class AmdSmiParameterNotSupportedException(AmdSmiException): def __init__(self, command, outputformat): + super().__init__() self.value = -8 self.command = command - if outputformat == "json": - values = {} - values["error"] = "Parameter '{}' is not supported on the system. Run '--help' for more info.".format(self.command) - values["code"] = self.value - self.message = json.dumps(values) - elif outputformat == "csv": - self.message = "error,code\n" + "Parameter '{}' is not supported on the system. Run '--help' for more info.,".format(self.command) + str(self.value) - else: - self.message = "Parameter '{}' is not supported on the system. Run '--help' for more info. Error code: {}".format(self.command, self.value) + self.output_format = outputformat + common_message = f"Parameter '{self.command}' is not supported on the system. Run '--help' for more info." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" class AmdSmiUnknownErrorException(AmdSmiException): def __init__(self, command, outputformat): + super().__init__() self.value = -100 self.command = command - if outputformat == "json": - values = {} - values["error"] = "An unknown error has occurred. Run 'help' for more info." - values["code"] = self.value - self.message = json.dumps(values) - elif outputformat == "csv": - self.message = "error,code\n" + "An unknown error has occurred. Run 'help' for more info.," + str(self.value) - else: - self.message = "An unknown error has occurred. Run 'help' for more info. Error code: {}".format(self.value) + self.output_format = outputformat + common_message = "An unknown error has occurred. Run 'help' for more info." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" class AmdSmiAMDSMIErrorException(AmdSmiException): def __init__(self, outputformat, error_code): + super().__init__() self.value = -1000 - abs(error_code) self.smilibcode = error_code + self.output_format = outputformat - if outputformat == "json": - values = {} - values["error"] = "AMDSMI has returned error '{}' - '{}'".format(self.value, - AMDSMI_ERROR_MESSAGES[abs(self.smilibcode)]) - values["code"] = self.value - self.message = json.dumps(values) - elif outputformat == "csv": - self.message = "error,code\n" + "AMDSMI has returned error '{}' - '{}',".format(self.value, _get_error_message(self.smilibcode)) + str(self.value) - else: - self.message = "AMDSMI has returned error '{}' - '{}' Error code: {}".format(self.value, _get_error_message(self.smilibcode), self.value) + common_message = f"AMDSMI has returned error '{self.value}' - '{AMDSMI_ERROR_MESSAGES[abs(self.smilibcode)]}'" + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index dd1c063803..9215c21213 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -236,6 +236,7 @@ class AMDSMICommands(): bus_info = e.get_error_info() if not self.all_arguments: raise e + try: bus_output_info['bdf'] = amdsmi_interface.amdsmi_get_device_bdf(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: @@ -245,7 +246,6 @@ class AMDSMICommands(): bus_output_info.update(bus_info) values_dict['bus'] = bus_output_info - if args.vbios: try: vbios_info = amdsmi_interface.amdsmi_get_vbios_info(args.gpu) @@ -282,6 +282,7 @@ class AMDSMICommands(): power_limit = e.get_error_info() if not self.all_arguments: raise e + try: temp_edge_limit = amdsmi_interface.amdsmi_dev_get_temp_metric(args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) @@ -343,10 +344,6 @@ class AMDSMICommands(): try: caps_info = amdsmi_interface.amdsmi_get_caps_info(args.gpu) - if self.logger.is_gpuvsmi_compatibility(): - del caps_info['ras_supported'] - caps_info['gfx'] = caps_info.pop('gfx') - if self.logger.is_human_readable_format(): for capability_name, capability_value in caps_info.items(): if isinstance(capability_value, list): @@ -565,8 +562,8 @@ class AMDSMICommands(): def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, usage=None, watch=None, watch_time=None, iterations=None, fb_usage=None, power=None, clock=None, temperature=None, ecc=None, pcie=None, voltage=None, fan=None, - pcie_usage=None, voltage_curve=None, overdrive=None, mem_overdrive=None, - perf_level=None, replay_count=None, xgmi_err=None, energy=None, mem_usage=None): + voltage_curve=None, overdrive=None, mem_overdrive=None, perf_level=None, + replay_count=None, xgmi_err=None, energy=None, mem_usage=None): """Get Metric information for target gpu Args: @@ -586,7 +583,6 @@ class AMDSMICommands(): pcie (bool, optional): Value override for args.pcie. Defaults to None. voltage (bool, optional): Value override for args.voltage. Defaults to None. fan (bool, optional): Value override for args.fan. Defaults to None. - pcie_usage (bool, optional): Value override for args.pcie_usage. Defaults to None. voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None. overdrive (bool, optional): Value override for args.overdrive. Defaults to None. mem_overdrive (bool, optional): Value override for args.mem_overdrive. Defaults to None. @@ -630,8 +626,6 @@ class AMDSMICommands(): args.voltage = voltage if fan: args.fan = fan - if pcie_usage: - args.pcie_usage = pcie_usage if voltage_curve: args.voltage_curve = voltage_curve if overdrive: @@ -676,15 +670,13 @@ class AMDSMICommands(): else: raise IndexError("args.gpu should not be an empty list") - # Check if any of the options have been set, if not then set them all to true - if not any([args.usage, args.fb_usage, args.power, args.clock, args.temperature, args.ecc, args.pcie, args.voltage, args.fan, - args.pcie_usage, args.voltage_curve, args.overdrive, args.mem_overdrive, args.perf_level, - args.replay_count, args.xgmi_err, args.energy, args.mem_usage]): + if not any([args.usage, args.fb_usage, args.power, args.clock, args.temperature, args.ecc, args.pcie, args.voltage, + args.fan, args.voltage_curve, args.overdrive, args.mem_overdrive, args.perf_level, args.replay_count, + args.xgmi_err, args.energy, args.mem_usage]): args.usage = args.fb_usage = args.power = args.clock = args.temperature = args.ecc = args.pcie = args.voltage = args.fan = \ - args.pcie_usage = args.voltage_curve = args.overdrive = args.mem_overdrive = args.perf_level = \ - args.replay_count = args.xgmi_err = args.energy = args.mem_usage = self.all_arguments = True - + args.voltage_curve = args.overdrive = args.mem_overdrive = args.perf_level = args.replay_count = args.xgmi_err = \ + args.energy = args.mem_usage = self.all_arguments = True # Add timestamp and store values for specified arguments values_dict = {} @@ -704,7 +696,9 @@ class AMDSMICommands(): values_dict['usage'] = engine_usage except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['usage'] = e.get_error_info() + if not self.all_arguments: + raise e if args.fb_usage: try: vram_usage = amdsmi_interface.amdsmi_get_vram_usage(args.gpu) @@ -720,21 +714,49 @@ class AMDSMICommands(): values_dict['fb_usage'] = vram_usage except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['fb_usage'] = e.get_error_info() + if not self.all_arguments: + raise e if args.power: + power_dict = {} try: - average_socket_power = amdsmi_interface.amdsmi_get_power_measure(args.gpu)['average_socket_power'] - - if self.logger.is_gpuvsmi_compatibility(): - pass + power_measure = amdsmi_interface.amdsmi_get_power_measure(args.gpu) + power_dict = {'average_socket_power': power_measure['average_socket_power'], + 'voltage_gfx': power_measure['voltage_gfx'], + 'voltage_soc': amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info, + 'voltage_mem': amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info} if self.logger.is_human_readable_format(): - unit = 'W' - average_socket_power = f"{average_socket_power} {unit}" + power_dict['average_socket_power'] = f"{power_dict['average_socket_power']} W" + power_dict['voltage_gfx'] = f"{power_dict['voltage_gfx']} mV" + power_dict['voltage_soc'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info + power_dict['voltage_mem'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info - values_dict['power'] = average_socket_power except amdsmi_exception.AmdSmiLibraryException as e: - raise e + power_dict = {'average_socket_power': e.get_error_info(), + 'voltage_gfx': e.get_error_info(), + 'voltage_soc': e.get_error_info(), + 'voltage_mem': e.get_error_info()} + + if not self.all_arguments: + raise e + + if self.logger.is_gpuvsmi_compatibility(): + power_dict['current_power'] = power_dict.pop('average_socket_power') + power_dict['current_voltage'] = power_dict.pop('voltage_gfx') + power_dict['current_voltage_soc'] = power_dict.pop('voltage_soc') + power_dict['current_voltage_mem'] = power_dict.pop('voltage_mem') + + try: + power_dict['current_fan_rpm'] = amdsmi_interface.amdsmi_dev_get_fan_rpms(args.gpu, 0) + if self.logger.is_human_readable_format(): + power_dict['current_fan_rpm'] = f"{power_dict['current_fan_rpm']} RPM" + except amdsmi_exception.AmdSmiLibraryException as e: + power_dict['current_fan_rpm'] = e.get_error_info() + if not self.all_arguments: + raise e + + values_dict['power'] = power_dict if args.clock: try: clock_gfx = amdsmi_interface.amdsmi_get_clock_measure(args.gpu, amdsmi_interface.AmdSmiClkType.GFX) @@ -751,7 +773,9 @@ class AMDSMICommands(): values_dict['clock'] = clocks except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['clock'] = e.get_error_info() + if not self.all_arguments: + raise e if args.temperature: try: temperature_edge_current = amdsmi_interface.amdsmi_dev_get_temp_metric( @@ -761,29 +785,44 @@ class AMDSMICommands(): temperature_vram_current = amdsmi_interface.amdsmi_dev_get_temp_metric( args.gpu, amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) - temperatures = { 'edge': temperature_edge_current, + temperatures = {'edge': temperature_edge_current, 'hotspot': temperature_junction_current, 'mem': temperature_vram_current} if self.logger.is_gpuvsmi_compatibility(): - temperatures = { 'edge_temperature': temperature_edge_current, + temperatures = {'edge_temperature': temperature_edge_current, 'hotspot_temperature': temperature_junction_current, 'mem_temperature': temperature_vram_current} if self.logger.is_human_readable_format(): unit = '\N{DEGREE SIGN}C' + if self.logger.is_gpuvsmi_compatibility(): + unit = 'C' for temperature_value in temperatures: temperatures[temperature_value] = f"{temperatures[temperature_value]} {unit}" values_dict['temperature'] = temperatures except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['temperature'] = e.get_error_info() + if not self.all_arguments: + raise e if args.ecc: + ecc_dict = {} try: - values_dict['ecc'] = amdsmi_interface.amdsmi_get_ecc_error_count(args.gpu) + ras_states = amdsmi_interface.amdsmi_get_ras_block_features_enabled(args.gpu) + for state in ras_states: + if state['status'] == amdsmi_interface.AmdSmiRasErrState.ENABLED: + gpu_block = amdsmi_interface.AmdSmiGpuBlock[state['block']] + ecc_count = amdsmi_interface.amdsmi_get_ecc_error_count(args.gpu, gpu_block) + ecc_dict[state['block']] = {'correctable' : ecc_count['correctable_count'], + 'uncorrectable': ecc_count['uncorrectable_count']} + if ecc_dict == {}: + ecc_dict = 'No RAS Blocks Enabled' + values_dict['ecc'] = ecc_dict except amdsmi_exception.AmdSmiLibraryException as e: values_dict['ecc'] = e.get_error_info() - raise e + if not self.all_arguments: + raise e if args.pcie: try: pcie_link_status = amdsmi_interface.amdsmi_get_pcie_link_caps(args.gpu) @@ -798,7 +837,9 @@ class AMDSMICommands(): values_dict['pcie'] = pcie_link_status except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['pcie'] = e.get_error_info() + if not self.all_arguments: + raise e if args.voltage: try: volt_metric = amdsmi_interface.amdsmi_dev_get_volt_metric( @@ -810,38 +851,39 @@ class AMDSMICommands(): values_dict['voltage'] = volt_metric except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['voltage'] = e.get_error_info() + if not self.all_arguments: + raise e if args.fan: try: fan_speed = amdsmi_interface.amdsmi_dev_get_fan_speed(args.gpu, 0) + fan_speed_error = False + except amdsmi_exception.AmdSmiLibraryException as e: + fan_speed = e.get_error_info() + fan_speed_error = True + + try: fan_max = amdsmi_interface.amdsmi_dev_get_fan_speed_max(args.gpu, 0) - if isinstance(fan_speed, int) and fan_max > 0: + if not fan_speed_error and fan_max > 0: fan_percent = round((float(fan_speed) / float(fan_max)) * 100, 2) if self.logger.is_human_readable_format(): unit = '%' fan_percent = f"{fan_percent} {unit}" else: fan_percent = 'Unable to detect fan speed' - - fan_rpm = amdsmi_interface.amdsmi_dev_get_fan_rpms(args.gpu, 0) - - values_dict['fan'] = {'speed': fan_speed, - 'max' : fan_max, - 'rpm' : fan_rpm, - 'usage' : fan_percent} except amdsmi_exception.AmdSmiLibraryException as e: - raise e - if args.pcie_usage: + fan_max = e.get_error_info() + fan_percent = 'Unable to detect fan speed' + try: - pcie_link_status = amdsmi_interface.amdsmi_get_pcie_link_status(args.gpu) - - if self.logger.is_human_readable_format(): - unit ='MT/s' - pcie_link_status['pcie_speed'] = f"{pcie_link_status['pcie_speed']} {unit}" - - values_dict['pcie_usage'] = pcie_link_status + fan_rpm = amdsmi_interface.amdsmi_dev_get_fan_rpms(args.gpu, 0) except amdsmi_exception.AmdSmiLibraryException as e: - raise e + fan_rpm = e.get_error_info() + + values_dict['fan'] = {'speed': fan_speed, + 'max' : fan_max, + 'rpm' : fan_rpm, + 'usage' : fan_percent} if args.voltage_curve: try: od_volt = amdsmi_interface.amdsmi_dev_get_od_volt_info(args.gpu) @@ -862,7 +904,6 @@ class AMDSMICommands(): values_dict['voltage_curve'] = e.get_error_info() if not self.all_arguments: raise e - if args.overdrive: try: overdrive_level = amdsmi_interface.amdsmi_dev_get_overdrive_level(args.gpu) @@ -873,29 +914,34 @@ class AMDSMICommands(): values_dict['overdrive'] = overdrive_level except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['overdrive'] = e.get_error_info() + if not self.all_arguments: + raise e if args.mem_overdrive: - values_dict['mem_overdrive'] = amdsmi_interface.AmdSmiRetCode.NOT_IMPLEMENTED - + values_dict['mem_overdrive'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info if args.perf_level: try: perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) values_dict['perf_level'] = perf_level except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['perf_level'] = e.get_error_info() + if not self.all_arguments: + raise e if args.replay_count: try: pci_replay_counter = amdsmi_interface.amdsmi_dev_get_pci_replay_counter(args.gpu) values_dict['replay_count'] = pci_replay_counter except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['replay_count'] = e.get_error_info() + if not self.all_arguments: + raise e if args.xgmi_err: try: values_dict['xgmi_err'] = amdsmi_interface.amdsmi_dev_xgmi_error_status(args.gpu) except amdsmi_interface.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.AmdSmiRetCode.ERR_NOT_SUPPORTED: + if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NOT_SUPPORTED: values_dict['xgmi_err'] = 'N/A' - else: + elif not self.all_arguments: raise e if args.energy: try: @@ -907,14 +953,16 @@ class AMDSMICommands(): values_dict['energy'] = energy except amdsmi_exception.AmdSmiLibraryException as e: - raise e + values_dict['energy'] = e.get_error_info() + if not self.all_arguments: + raise e if args.mem_usage: + memory_total = {} try: memory_total_vram = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) memory_total_vis_vram = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) memory_total_gtt = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) - memory_total = {} # Convert mem_usage to megabytes memory_total['vram'] = memory_total_vram // (1024*1024) memory_total['vis_vram'] = memory_total_vis_vram // (1024*1024) @@ -927,10 +975,36 @@ class AMDSMICommands(): memory_total['vis_vram'] = f"{memory_total['vis_vram']} {unit}" memory_total['gtt'] = f"{memory_total['gtt']} {unit}" - - values_dict['mem_usage'] = memory_total except amdsmi_exception.AmdSmiLibraryException as e: - raise e + memory_total['vram'] = e.get_error_info() + memory_total['vis_vram'] = e.get_error_info() + memory_total['gtt'] = e.get_error_info() + if not self.all_arguments: + raise e + + try: + total_used_vram = amdsmi_interface.amdsmi_dev_get_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) + total_used_vis_vram = amdsmi_interface.amdsmi_dev_get_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) + total_used_gtt = amdsmi_interface.amdsmi_dev_get_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) + + # Convert mem_usage to megabytes + memory_total['used_vram'] = total_used_vram // (1024*1024) + memory_total['used_vis_vram'] = total_used_vis_vram // (1024*1024) + memory_total['used_gtt'] = total_used_gtt // (1024*1024) + + if self.logger.is_human_readable_format(): + memory_total['used_vram'] = f"{memory_total['used_vram']} {unit}" + memory_total['used_vis_vram'] = f"{memory_total['used_vis_vram']} {unit}" + memory_total['used_gtt'] = f"{memory_total['used_gtt']} {unit}" + + except amdsmi_exception.AmdSmiLibraryException as e: + memory_total['used_vram'] = e.get_error_info() + memory_total['used_vis_vram'] = e.get_error_info() + memory_total['used_gtt'] = e.get_error_info() + if not self.all_arguments: + raise e + + values_dict['mem_usage'] = memory_total # Store values in logger.output self.logger.store_output(args.gpu, 'values', values_dict) @@ -1126,6 +1200,7 @@ class AMDSMICommands(): for thread in threads: thread.join() + def topology(self, args, multiple_devices=False, gpu=None, access=None, weight=None, hops=None, type=None, numa=None, numa_bw=None): """ Get topology information for target gpus @@ -1196,6 +1271,7 @@ class AMDSMICommands(): if args.numa_bw: pass + def set_value(self, args, multiple_devices=False, gpu=None, clock=None, sclk=None, mclk=None, pcie=None, slevel=None, mlevel=None, vc=None, srange=None, mrange=None, fan=None, perflevel=None, overdrive=None, memoverdrive=None, @@ -1277,6 +1353,18 @@ class AMDSMICommands(): args.gpu = device_handle + # Build GPU string for errors + try: + gpu_bdf = amdsmi_interface.amdsmi_get_device_bdf(args.gpu) + except amdsmi_exception.AmdSmiLibraryException: + gpu_bdf = f'BDF Unavailable for {args.gpu}' + try: + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + except IndexError: + gpu_id = f'ID Unavailable for {args.gpu}' + gpu_string = f"GPU ID: {gpu_id} BDF:{gpu_bdf}" + + # Handle args if args.clock: clock_type, freq_bitmask = args.clock @@ -1284,231 +1372,232 @@ class AMDSMICommands(): try: perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to get performance level of {args.gpu}") - - if 'manual' in perf_level.lower(): - try: - amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual") - - if clock_type != amdsmi_interface.AmdSmiClkType.PCIE.value: - try: - amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask) - except amdsmi_exception.AmdSmiLibraryException as e: - clock_type = amdsmi_interface.AmdSmiClkType(clock_type) - raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}") - print(f'Successfully set frequency bitmask on {args.gpu}') - else: - try: - amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask) - except amdsmi_exception.AmdSmiLibraryException as e: - clock_type = amdsmi_interface.AmdSmiClkType(clock_type) - raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}") - print(f'Successfully set frequency bitmask on {args.gpu}') - - if args.sclk: - freq_bitmask = args.sclk - clock_type = amdsmi_interface.AmdSmiClkType.SYS - # Check if the performance level is manual, if not then set it to manual - try: - perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to get performance level of {args.gpu}") - - if 'manual' in perf_level.lower(): - try: - amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual") - - try: - amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type.value, freq_bitmask) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}") - print(f'Successfully set frequency bitmask on {args.gpu}') - - if args.mclk: - freq_bitmask = args.sclk - clock_type = amdsmi_interface.AmdSmiClkType.MEM - # Check if the performance level is manual, if not then set it to manual - try: - perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to get performance level of {args.gpu}") - - if 'manual' in perf_level.lower(): - try: - amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual") - - try: - amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type.value, freq_bitmask) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}") - print(f'Successfully set frequency bitmask on {args.gpu}') - - if args.pcie: - freq_bitmask = args.sclk - clock_type = amdsmi_interface.AmdSmiClkType.PCIE - # Check if the performance level is manual, if not then set it to manual - try: - perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to get performance level of {args.gpu}") - - if 'manual' in perf_level.lower(): - try: - amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual") - try: - amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}") - print(f'Successfully set frequency bitmask on {args.gpu}') - - if args.slevel: - level, value = args.slevel - level = amdsmi_interface.AmdSmiFreqInd(level).value - clock_type = amdsmi_interface.AmdSmiClkType.SYS - try: - amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type.value) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {args.gpu}") - print(f'Successfully changed clock frequency on {args.gpu}') - - if args.mlevel: - level, value = args.mlevel - level = amdsmi_interface.AmdSmiFreqInd(level).value - clock_type = amdsmi_interface.AmdSmiClkType.MEM - try: - amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type.value) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {args.gpu}") - print(f'Successfully changed clock frequency on {args.gpu}') - - if args.vc: - point, clk, volt = args.vc - try: - amdsmi_interface.amdsmi_dev_set_od_volt_info(args.gpu, point, clk, volt) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the Voltage Curve point {point} to {clk}(MHz) {volt}(mV) on {args.gpu}") - print(f'Successfully set voltage point {point} to {clk}(MHz) {volt}(mV) on {args.gpu}') - - if args.srange: - min_value, max_value = args.srange - clock_type = amdsmi_interface.AmdSmiClkType.SYS - try: - amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type.value) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {args.gpu}") - print(f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {args.gpu}") - - if args.mrange: - min_value, max_value = args.srange - clock_type = amdsmi_interface.AmdSmiClkType.MEM - try: - amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type.value) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {args.gpu}") - print(f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {args.gpu}") - - if args.fan: - try: - amdsmi_interface.amdsmi_dev_set_fan_speed(args.gpu, 0, args.fan) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set fan speed {args.fan} on {args.gpu}") - print(f"Successfully set fan speed {args.fan} on {args.gpu}") - - if args.perflevel: - perf_levels = amdsmi_interface.amdsmi_wrapper.amdsmi_dev_perf_level_t__enumvalues - for value in perf_levels: - if args.perflevel.lower() in perf_levels[value]: - try: - amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, value) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set performance level {args.perflevel} on {args.gpu}") - print(f"Successfully set performance level {args.perflevel} on {args.gpu}") - break - - if args.overdrive or args.overdrive == 0: - # Check if the performance level is manual, if not then set it to manual - try: - perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to get performance level of {args.gpu}") + raise ValueError(f"Unable to get performance level of {gpu_string}") from e if 'manual' in perf_level.lower(): try: amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual") + raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e - try: - amdsmi_interface.amdsmi_dev_set_overdrive_level_v1(args.gpu, args.overdrive) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set overdrive {args.overdrive} to {args.gpu}") - print(f"Successfully to set overdrive {args.overdrive} to {args.gpu}") + if clock_type != amdsmi_interface.AmdSmiClkType.PCIE: + try: + amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e + else: + try: + amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e - if args.memoverdrive or args.memoverdrive == 0: + self.logger.store_output(args.gpu, 'clock', f'Successfully set clock frequency bitmask for {clock_type}') + + if isinstance(args.sclk, int): + freq_bitmask = args.sclk + clock_type = amdsmi_interface.AmdSmiClkType.SYS # Check if the performance level is manual, if not then set it to manual try: perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to get performance level of {args.gpu}") + raise ValueError(f"Unable to get performance level of {gpu_string}") from e if 'manual' in perf_level.lower(): try: - amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value) + amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual") + raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e - if args.poweroverdrive: + try: + amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'sclk', 'Successfully set clock frequency bitmask') + if isinstance(args.mclk, int): + freq_bitmask = args.mclk + clock_type = amdsmi_interface.AmdSmiClkType.MEM + # Check if the performance level is manual, if not then set it to manual + try: + perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to get performance level of {gpu_string}") from e + + if 'manual' in perf_level.lower(): + try: + amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e + + try: + amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'mclk', 'Successfully set clock frequency bitmask') + if isinstance(args.pcie, int): + freq_bitmask = args.pcie + clock_type = amdsmi_interface.AmdSmiClkType.PCIE + # Check if the performance level is manual, if not then set it to manual + try: + perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to get performance level of {gpu_string}") from e + + if 'manual' in perf_level.lower(): + try: + amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e + try: + amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'pcie', 'Successfully set clock frequency bitmask') + if isinstance(args.slevel, int): + level, value = args.slevel + level = amdsmi_interface.AmdSmiFreqInd(level) + clock_type = amdsmi_interface.AmdSmiClkType.SYS + try: + amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'slevel', 'Successfully changed clock frequency') + if isinstance(args.mlevel, int): + level, value = args.mlevel + level = amdsmi_interface.AmdSmiFreqInd(level) + clock_type = amdsmi_interface.AmdSmiClkType.MEM + try: + amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'mlevel', 'Successfully changed clock frequency') + if isinstance(args.vc, int): + point, clk, volt = args.vc + try: + amdsmi_interface.amdsmi_dev_set_od_volt_info(args.gpu, point, clk, volt) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the Voltage Curve point {point} to {clk}(MHz) {volt}(mV) on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'vc', f'Successfully set voltage point {point} to {clk}(MHz) {volt}(mV)') + if isinstance(args.srange, int): + min_value, max_value = args.srange + clock_type = amdsmi_interface.AmdSmiClkType.SYS + try: + amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'srange', f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz)") + if isinstance(args.mrange, int): + min_value, max_value = args.srange + clock_type = amdsmi_interface.AmdSmiClkType.MEM + try: + amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'mrange', f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz)") + if isinstance(args.fan, int): + try: + amdsmi_interface.amdsmi_dev_set_fan_speed(args.gpu, 0, args.fan) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set fan speed {args.fan} on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'fan', f"Successfully set fan speed {args.fan}") + if args.perflevel: + perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perflevel] + try: + amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, perf_level) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set performance level {args.perflevel} on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perflevel}") + if isinstance(args.overdrive, int): + # Check if the performance level is manual, if not then set it to manual + try: + perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to get performance level of {gpu_string}") from e + + if 'manual' in perf_level.lower(): + try: + amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e + + try: + amdsmi_interface.amdsmi_dev_set_overdrive_level_v1(args.gpu, args.overdrive) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set overdrive {args.overdrive} to {gpu_string}") from e + + self.logger.store_output(args.gpu, 'overdrive', f"Successfully to set overdrive level to {args.overdrive}") + if isinstance(args.memoverdrive, int): + # Check if the performance level is manual, if not then set it to manual + try: + perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to get performance level of {gpu_string}") from e + + if 'manual' in perf_level.lower(): + try: + amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e + + self.logger.store_output(args.gpu, 'memoverdrive', f"Successfully to set memoverdrive level to {args.memoverdrive}") + if isinstance(args.poweroverdrive, int): overdrive_power_cap = args.poweroverdrive try: power_caps = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to get the power cap info for {args.gpu}") + raise ValueError(f"Unable to get the power cap info for {gpu_string}") from e if overdrive_power_cap == 0: overdrive_power_cap = power_caps['power_cap_default'] else: overdrive_power_cap *= 1000000 if overdrive_power_cap < power_caps['min_power_cap']: - raise ValueError(self, f"Requested power cap: {overdrive_power_cap} is lower than the min power cap: {power_caps['min_power_cap']}") + raise ValueError(f"Requested power cap: {overdrive_power_cap} is lower than the min power cap: {power_caps['min_power_cap']}") if overdrive_power_cap > power_caps['max_power_cap']: - raise ValueError(self, f"Requested power cap: {overdrive_power_cap} is greater than the max power cap: {power_caps['max_power_cap']}") + raise ValueError(f"Requested power cap: {overdrive_power_cap} is greater than the max power cap: {power_caps['max_power_cap']}") if overdrive_power_cap == power_caps['power_cap']: - raise ValueError(self, f"Requested power cap: {overdrive_power_cap} is the same as the current power cap: {power_caps['power_cap']}") + raise ValueError(f"Requested power cap: {overdrive_power_cap} is the same as the current power cap: {power_caps['power_cap']}") try: amdsmi_interface.amdsmi_dev_set_power_cap(args.gpu, 0, overdrive_power_cap) except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set power cap to {overdrive_power_cap} on {args.gpu}") + raise ValueError(f"Unable to set power cap to {overdrive_power_cap} on {gpu_string}") from e try: power_caps = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to get the power cap info for {args.gpu} post set") + raise ValueError(f"Unable to get the power cap info for {gpu_string} post set") from e if power_caps['power_cap'] == overdrive_power_cap: - print(f"Successfully set the power cap {overdrive_power_cap} on {args.gpu}") + self.logger.store_output(args.gpu, 'power_cap', f"Successfully set the power cap {overdrive_power_cap}") else: - raise ValueError(self, f"Power cap: {overdrive_power_cap} set failed on {args.gpu}") - + raise ValueError(f"Power cap: {overdrive_power_cap} set failed on {gpu_string}") if args.profile: - print(amdsmi_interface.AmdSmiRetCode.NOT_IMPLEMENTED) - - if args.perfdeterminism: + self.logger.store_output(args.gpu, 'profile', "Not Yet Implemented") + if isinstance(args.perfdeterminism, int): try: amdsmi_interface.amdsmi_set_perf_determinism_mode(args.gpu, args.perfdeterminism) except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to set performance determinism and clock frequency to {args.perfdeterminism} on {args.gpu}") - print(f"Successfully enabled performance determinism and set GFX clock frequency to {args.perfdeterminism} on {args.gpu}") + raise ValueError(f"Unable to set performance determinism and clock frequency to {args.perfdeterminism} on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perfdeterminism}") + + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices + + self.logger.print_output() def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, @@ -1652,7 +1741,7 @@ class AMDSMICommands(): def rocm_smi(self, args): - print("Placeholder for rocm-smi legacy commandss") + print("Placeholder for rocm-smi legacy commands") def _event_thread(self, commands, i): diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index 73a7609412..152c554e69 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -22,6 +22,7 @@ import logging import platform +import sys import time from pathlib import Path @@ -118,6 +119,20 @@ class AMDSMIHelpers(): return self._is_windows + def get_output_format(self): + """Returns the output format read from sys.argv + Returns: + str: outputformat + """ + args = sys.argv[1:] + outputformat = "human" + if "--json" in args or "--j" in args: + outputformat = "json" + elif "--csv" in args or "--c" in args: + outputformat = "csv" + return outputformat + + def get_gpu_choices(self): """Return dictionary of possible GPU choices and string of the output: Dictionary will be in format: gpus[ID] : (BDF, UUID, Device Handle) @@ -307,11 +322,34 @@ class AMDSMIHelpers(): return asic_info['vendor_id'] == AMD_VENDOR_ID - def is_valid_clock_type(self, clock_type): - if clock_type in amdsmi_interface.amdsmi_wrapper.amdsmi_clk_type_t__enumvalues: - return True, amdsmi_interface.amdsmi_wrapper.amdsmi_clk_type_t__enumvalues.keys() - else: - return False, amdsmi_interface.amdsmi_wrapper.amdsmi_clk_type_t__enumvalues.keys() + def get_perf_levels(self): + perf_levels_str = [clock.name for clock in amdsmi_interface.AmdSmiDevPerfLevel] + perf_levels_int = list(set(clock.value for clock in amdsmi_interface.AmdSmiDevPerfLevel)) + return perf_levels_str, perf_levels_int + + + def get_clock_types(self): + clock_types_str = [clock.name for clock in amdsmi_interface.AmdSmiClkType] + clock_types_int = list(set(clock.value for clock in amdsmi_interface.AmdSmiClkType)) + return clock_types_str, clock_types_int + + + def validate_clock_type(self, input_clock_type): + valid_clock_types_str, valid_clock_types_int = self.get_clock_types() + + valid_clock_input = False + if isinstance(input_clock_type, str): + for clock_type in valid_clock_types_str: + if input_clock_type.lower() == clock_type.lower(): + input_clock_type = clock_type # Set input_clock_type to enum value in AmdSmiClkType + valid_clock_input = True + break + elif isinstance(input_clock_type, int): + if input_clock_type in valid_clock_types_int: + input_clock_type = amdsmi_interface.AmdSmiClkType(input_clock_type) + valid_clock_input = True + + return valid_clock_input, input_clock_type def confirm_out_of_spec_warning(self, auto_respond=False): @@ -348,15 +386,3 @@ class AMDSMIHelpers(): return True, profile_presets[profile] else: return False, profile_presets.values() - - - def get_perf_level(self, device_handle): - """ Return the current performance level of a given device - - @param device_handle: DRM device identifier - """ - - try: - ret = amdsmi_interface.amdsmi_dev_get_perf_level(device_handle) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(self, f"Unable to get performance level of {device_handle}") diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 5e87cfa2c2..8f71ed5d29 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -31,20 +31,26 @@ import sys from _version import __version__ from amdsmi_helpers import AMDSMIHelpers import amdsmi_cli_exceptions -from BDF import BDF class AMDSMIParser(argparse.ArgumentParser): + """Unified Parser for AMDSMI CLI. + This parser doesn't access amdsmi's lib directly,but via AMDSMIHelpers, + this allows for us to use this parser with future OS & Platform integration. + + Args: + argparse (ArgumentParser): argparse.ArgumentParser + """ def __init__(self, version, discovery, static, firmware, bad_pages, metric, process, profile, event, topology, set_value, reset, rocmsmi): # Helper variables - self.amdsmi_helpers = AMDSMIHelpers() - self.gpu_choices, self.gpu_choices_str = self.amdsmi_helpers.get_gpu_choices() + self.helpers = AMDSMIHelpers() + self.gpu_choices, self.gpu_choices_str = self.helpers.get_gpu_choices() self.vf_choices = ['3', '2', '1'] version_string = f"Version: {__version__}" - platform_string = f"Platform: {self.amdsmi_helpers.os_info()}" + platform_string = f"Platform: {self.helpers.os_info()}" # Adjust argument parser options super().__init__( @@ -83,12 +89,7 @@ class AMDSMIParser(argparse.ArgumentParser): if int_value.isdigit(): # Is digit works only on positive numbers return int(int_value) else: - args = sys.argv[1:] - outputformat = "human" - if "--json" in args or "--j" in args: - outputformat = "json" - elif "--csv" in args or "-c" in args: - outputformat = "csv" + outputformat = self.helpers.get_output_format() raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat) @@ -100,6 +101,7 @@ class AMDSMIParser(argparse.ArgumentParser): If the path is a file and it doesn't exist create and return the file path """ class CheckOutputFilePath(argparse.Action): + outputformat = self.helpers.get_output_format() # Checks the values def __call__(self, parser, args, values, option_string=None): path = Path(values) @@ -107,13 +109,7 @@ class AMDSMIParser(argparse.ArgumentParser): if path.parent.is_dir(): path.touch() else: - args = sys.argv[1:] - outputformat = "human" - if "--json" in args or "--j" in args: - outputformat = "json" - elif "--csv" in args or "-c" in args: - outputformat = "csv" - raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, outputformat) + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat) if path.is_dir(): path = path / f"{int(time.time())}-amdsmi-output.txt" @@ -122,13 +118,7 @@ class AMDSMIParser(argparse.ArgumentParser): elif path.is_file(): setattr(args, self.dest, path) else: - args = sys.argv[1:] - outputformat = "human" - if "--json" in args or "--j" in args: - outputformat = "json" - elif "--csv" in args or "-c" in args: - outputformat = "csv" - raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, outputformat) + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat) return CheckOutputFilePath @@ -178,8 +168,9 @@ class AMDSMIParser(argparse.ArgumentParser): If 1 or more device handles are not found then raise an ArgumentError for the first invalid gpu seen """ - amdsmi_helpers = self.amdsmi_helpers + amdsmi_helpers = self.helpers class _GPUSelectAction(argparse.Action): + ouputformat=self.helpers.get_output_format() # Checks the values def __call__(self, parser, args, values, option_string=None): status, selected_device_handles = amdsmi_helpers.get_device_handles_from_gpu_selections(gpu_selections=values, @@ -187,17 +178,10 @@ class AMDSMIParser(argparse.ArgumentParser): if status: setattr(args, self.dest, selected_device_handles) else: - invalid_selection = selected_device_handles - args = sys.argv[1:] - outputformat = "human" - if "--json" in args or "--j" in args: - outputformat = "json" - elif "--csv" in args or "-c" in args: - outputformat = "csv" - if invalid_selection == '': - raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--gpu", outputformat) + if selected_device_handles == '': + raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--gpu", _GPUSelectAction.ouputformat) else: - raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(invalid_selection, outputformat) + raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles, _GPUSelectAction.ouputformat) return _GPUSelectAction @@ -221,6 +205,21 @@ class AMDSMIParser(argparse.ArgumentParser): choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]) + def _add_watch_arguments(self, subcommand_parser): + # Device arguments help text + watch_help = "Reprint the command in a loop of Interval seconds" + watch_time_help = "The total time to watch the given command" + iterations_help = "Total number of iterations to loop on the given command" + + # Mutually Exclusive Args within the subparser + subcommand_parser.add_argument('-w', '--watch', action='store', metavar='loop_time', + type=self._positive_int, required=False, help=watch_help) + subcommand_parser.add_argument('-W', '--watch_time', action=self._check_watch_selected(), metavar='total_loop_time', + type=self._positive_int, required=False, help=watch_time_help) + subcommand_parser.add_argument('-i', '--iterations', action=self._check_watch_selected(), metavar='number_of_iterations', + type=self._positive_int, required=False, help=iterations_help) + + def _add_device_arguments(self, subcommand_parser, required=False): # Device arguments help text gpu_help = f"Select a GPU ID, BDF, or UUID from the possible choices:\n{self.gpu_choices_str}" @@ -232,7 +231,7 @@ class AMDSMIParser(argparse.ArgumentParser): device_args.add_argument('-g', '--gpu', action=self._gpu_select(self.gpu_choices), nargs='+', help=gpu_help) - if self.amdsmi_helpers.is_hypervisor(): + if self.helpers.is_hypervisor(): device_args.add_argument('-v', '--vf', action='store', nargs='+', help=vf_help, choices=self.vf_choices) @@ -313,13 +312,13 @@ class AMDSMIParser(argparse.ArgumentParser): static_parser.add_argument('-c', '--caps', action='store_true', required=False, help=caps_help) # Options to display on Hypervisors and Baremetal - if self.amdsmi_helpers.is_hypervisor() or self.amdsmi_helpers.is_baremetal(): + if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) - if self.amdsmi_helpers.is_linux(): + if self.helpers.is_linux(): static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help) # Options to only display on a Hypervisor - if self.amdsmi_helpers.is_hypervisor(): + if self.helpers.is_hypervisor(): static_parser.add_argument('-d', '--dfc-ucode', action='store_true', required=False, help=dfc_help) static_parser.add_argument('-f', '--fb-info', action='store_true', required=False, help=fb_help) static_parser.add_argument('-n', '--num-vf', action='store_true', required=False, help=num_vf_help) @@ -349,12 +348,12 @@ class AMDSMIParser(argparse.ArgumentParser): firmware_parser.add_argument('-f', '--ucode-list', '--fw-list', dest='fw_list', action='store_true', required=False, help=fw_list_help, default=True) # Options to only display on a Hypervisor - if self.amdsmi_helpers.is_hypervisor(): + if self.helpers.is_hypervisor(): firmware_parser.add_argument('-e', '--error-records', action='store_true', required=False, help=err_records_help) def _add_bad_pages_parser(self, subparsers, func): - if not (self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux()): + if not (self.helpers.is_baremetal() and self.helpers.is_linux()): # The bad_pages subcommand is only applicable to Linux Baremetal systems return @@ -369,7 +368,7 @@ class AMDSMIParser(argparse.ArgumentParser): un_res_help = "Displays unreservable pages" # Create bad_pages subparser - bad_pages_parser = subparsers.add_parser('bad-pages', help=bad_pages_help, description=bad_pages_subcommand_help, aliases=['bad_pages']) + bad_pages_parser = subparsers.add_parser('bad-pages', help=bad_pages_help, description=bad_pages_subcommand_help) bad_pages_parser._optionals.title = bad_pages_optionals_title bad_pages_parser.formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, max_help_position=80, width=90) bad_pages_parser.set_defaults(func=func) @@ -393,9 +392,6 @@ class AMDSMIParser(argparse.ArgumentParser): # Optional arguments help text usage_help = "Displays engine usage information" - watch_help = "Reprint the command in a loop of Interval seconds" - watch_time_help = "The total time to watch the given command" - iterations_help = "Total number of iterations to loop on the given command" # Help text for Arguments only Available on Virtual OS and Baremetal platforms fb_usage_help = "Total and used framebuffer" @@ -410,7 +406,6 @@ class AMDSMIParser(argparse.ArgumentParser): # Help text for Arguments only on Linux Baremetal platforms fan_help = "Current fan speed" - pcie_usage_help = "Estimated PCIe link usage" vc_help = "Display voltage curve" overdrive_help = "Current GPU clock overdrive level" mo_help = "Current memory clock overdrive level" @@ -435,21 +430,18 @@ class AMDSMIParser(argparse.ArgumentParser): # Add Device args self._add_device_arguments(metric_parser, required=False) + # Add Watch args + self._add_watch_arguments(metric_parser) + # Optional Args metric_parser.add_argument('-u', '--usage', action='store_true', required=False, help=usage_help) - metric_parser.add_argument('-w', '--watch', action='store', metavar='Interval', - type=self._positive_int, required=False, help=watch_help) - metric_parser.add_argument('-W', '--watch_time', action=self._check_watch_selected(), metavar='Duration', - type=self._positive_int, required=False, help=watch_time_help) - metric_parser.add_argument('-i', '--iterations', action=self._check_watch_selected(), metavar='Iterations', - type=self._positive_int, required=False, help=iterations_help) # Optional Args for Virtual OS and Baremetal systems - if self.amdsmi_helpers.is_virtual_os() or self.amdsmi_helpers.is_baremetal(): + if self.helpers.is_virtual_os() or self.helpers.is_baremetal(): metric_parser.add_argument('-b', '--fb-usage', action='store_true', required=False, help=fb_usage_help) # Optional Args for Hypervisors and Baremetal systems - if self.amdsmi_helpers.is_hypervisor() or self.amdsmi_helpers.is_baremetal(): + if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): metric_parser.add_argument('-p', '--power', action='store_true', required=False, help=power_help) metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help) metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) @@ -458,9 +450,8 @@ class AMDSMIParser(argparse.ArgumentParser): metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help) # Optional Args for Linux Baremetal Systems - if self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux(): + if self.helpers.is_baremetal() and self.helpers.is_linux(): metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) - metric_parser.add_argument('-s', '--pcie-usage', action='store_true', required=False, help=pcie_usage_help) metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) metric_parser.add_argument('-M', '--mem-overdrive', action='store_true', required=False, help=mo_help) @@ -471,14 +462,14 @@ class AMDSMIParser(argparse.ArgumentParser): metric_parser.add_argument('-m', '--mem-usage', action='store_true', required=False, help=mem_usage_help) # Options to only display to Hypervisors - if self.amdsmi_helpers.is_hypervisor(): + if self.helpers.is_hypervisor(): metric_parser.add_argument('-s', '--schedule', action='store_true', required=False, help=schedule_help) metric_parser.add_argument('-G', '--guard', action='store_true', required=False, help=guard_help) metric_parser.add_argument('-u', '--guest', action='store_true', required=False, help=guest_help) def _add_process_parser(self, subparsers, func): - if self.amdsmi_helpers.is_hypervisor(): + if self.helpers.is_hypervisor(): # Don't add this subparser on Hypervisors # This subparser is only available to Guest and Baremetal systems return @@ -495,9 +486,7 @@ class AMDSMIParser(argparse.ArgumentParser): pid_help = "Gets all process information about the specified process based on Process ID" name_help = "Gets all process information about the specified process based on Process Name.\ \nIf multiple processes have the same name information is returned for all of them." - watch_help = "Reprint the command in a loop of Interval seconds" - watch_time_help = "The total time to watch the given command" - iterations_help = "Total number of iterations to loop on the given command" + # Create process subparser process_parser = subparsers.add_parser('process', help=process_help, description=process_subcommand_help) @@ -509,21 +498,18 @@ class AMDSMIParser(argparse.ArgumentParser): # Add Device args self._add_device_arguments(process_parser, required=False) + # Add Watch args + self._add_watch_arguments(process_parser) + # Optional Args process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help) process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help) process_parser.add_argument('-p', '--pid', action='store', type=self._positive_int, required=False, help=pid_help) process_parser.add_argument('-n', '--name', action='store', required=False, help=name_help) - process_parser.add_argument('-w', '--watch', action='store', metavar='Interval', - type=self._positive_int, required=False, help=watch_help) - process_parser.add_argument('-W', '--watch_time', action=self._check_watch_selected(), metavar='Duration', - type=self._positive_int, required=False, help=watch_time_help) - process_parser.add_argument('-i', '--iterations', action=self._check_watch_selected(), metavar='Iterations', - type=self._positive_int, required=False, help=iterations_help) def _add_profile_parser(self, subparsers, func): - if not (self.amdsmi_helpers.is_windows() and self.amdsmi_helpers.is_hypervisor()): + if not (self.helpers.is_windows() and self.helpers.is_hypervisor()): # This subparser only applies to Hypervisors return @@ -544,7 +530,7 @@ class AMDSMIParser(argparse.ArgumentParser): def _add_event_parser(self, subparsers, func): - if self.amdsmi_helpers.is_linux() and not self.amdsmi_helpers.is_virtual_os(): + if self.helpers.is_linux() and not self.helpers.is_virtual_os(): # This subparser only applies to Linux BareMetal & Linux Hypervisors, NOT Linux Guest return @@ -566,7 +552,7 @@ class AMDSMIParser(argparse.ArgumentParser): def _add_topology_parser(self, subparsers, func): return - if not(self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux()): + if not(self.helpers.is_baremetal() and self.helpers.is_linux()): # This subparser is only applicable to Baremetal Linux return @@ -599,11 +585,11 @@ class AMDSMIParser(argparse.ArgumentParser): topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help) topology_parser.add_argument('-t', '--type', action='store_true', required=False, help=type_help) topology_parser.add_argument('-n', '--numa', action='store_true', required=False, help=numa_help) - topology_parser.add_argument('-b', '--numa_bw', action='store_true', required=False, help=numa_bw_help) + topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help) def _add_set_value_parser(self, subparsers, func): - if not(self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux()): + if not(self.helpers.is_baremetal() and self.helpers.is_linux()): # This subparser is only applicable to Baremetal Linux return @@ -628,7 +614,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_mem_overdrive_help = "Set memory overclock overdrive level ***DEPRECATED IN NEWER KERNEL VERSIONS (use --mlevel instead)***" set_power_overdrive_help = "Set the maximum GPU power using power overdrive in Watts" set_profile_help = "Set power profile level (#) or a quoted string of custom profile attributes" - set_perf_det_help = "Set GPU clock frequency limit to get minimal performance variation" + set_perf_det_help = "Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation" # Create set_value subparser set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help) @@ -641,7 +627,7 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_device_arguments(set_value_parser, required=True) # Optional Args - set_value_parser.add_argument('-c', '--clock', action=self._validate_set_clock(True), nargs='+', type=self._positive_int, required=False, help=set_clock_help, metavar=('CLK_TYPE', 'CLK_LEVELS')) + set_value_parser.add_argument('-c', '--clock', action=self._validate_set_clock(True), nargs='+', required=False, help=set_clock_help, metavar=('CLK_TYPE', 'CLK_LEVELS')) set_value_parser.add_argument('-s', '--sclk', action=self._validate_set_clock(False), nargs='+', type=self._positive_int, required=False, help=set_sclk_help, metavar='CLK_LEVELS') set_value_parser.add_argument('-m', '--mclk', action=self._validate_set_clock(False), nargs='+', type=self._positive_int, required=False, help=set_mclk_help, metavar='CLK_LEVELS') set_value_parser.add_argument('-p', '--pcie', action=self._validate_set_clock(False), nargs='+', type=self._positive_int, required=False, help=set_pcie_help, metavar='CLK_LEVELS') @@ -651,7 +637,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-r', '--srange', action=self._prompt_spec_warning(), nargs=2, type=self._positive_int, required=False, help=set_srange_help, metavar=('SCLKMIN', 'SCLKMAX')) set_value_parser.add_argument('-R', '--mrange', action=self._prompt_spec_warning(), nargs=2, type=self._positive_int, required=False, help=set_mrange_help, metavar=('MCLKMIN', 'MCLKMAX')) set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%') - set_value_parser.add_argument('-l', '--perflevel', action='store', choices=['auto', 'low', 'high', 'manual'], required=False, help=set_perf_level_help, metavar='LEVEL') + set_value_parser.add_argument('-l', '--perflevel', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL') set_value_parser.add_argument('-o', '--overdrive', action=self._validate_overdrive_percent(), required=False, help=set_overdrive_help, metavar='%') set_value_parser.add_argument('-O', '--memoverdrive', action=self._validate_overdrive_percent(), required=False, help=set_mem_overdrive_help, metavar='%') set_value_parser.add_argument('-w', '--poweroverdrive', action=self._prompt_spec_warning(), type=self._positive_int, required=False, help=set_power_overdrive_help, metavar="WATTS") @@ -661,13 +647,14 @@ class AMDSMIParser(argparse.ArgumentParser): def _validate_set_clock(self, validate_clock_type=True): """ Validate Clock input""" - amdsmi_helpers = self.amdsmi_helpers + amdsmi_helpers = self.helpers class _ValidateClockType(argparse.Action): - # Checks the values + # Checks the clock type and clock values def __call__(self, parser, args, values, option_string=None): if validate_clock_type: clock_type = values[0] - valid_clock_type, clock_types = amdsmi_helpers.is_valid_clock_type(clock_type=clock_type) + clock_types = amdsmi_helpers.get_clock_types()[0] + valid_clock_type, amdsmi_clock_type = amdsmi_helpers.validate_clock_type(input_clock_type=clock_type) if not valid_clock_type: raise argparse.ArgumentError(self, f"Invalid argument: '{clock_type}' needs to be a valid clock type:{clock_types}") @@ -682,7 +669,7 @@ class AMDSMIParser(argparse.ArgumentParser): freq_bitmask |= (1 << level) if validate_clock_type: - setattr(args, self.dest, (clock_type, freq_bitmask)) + setattr(args, self.dest, (amdsmi_clock_type, freq_bitmask)) else: setattr(args, self.dest, freq_bitmask) return _ValidateClockType @@ -690,7 +677,7 @@ class AMDSMIParser(argparse.ArgumentParser): def _prompt_spec_warning(self): """ Prompt out of spec warning""" - amdsmi_helpers = self.amdsmi_helpers + amdsmi_helpers = self.helpers class _PromptSpecWarning(argparse.Action): # Checks the values def __call__(self, parser, args, values, option_string=None): @@ -701,57 +688,58 @@ class AMDSMIParser(argparse.ArgumentParser): def _validate_fan_speed(self): """ Validate fan speed input""" - amdsmi_helpers = self.amdsmi_helpers + amdsmi_helpers = self.helpers class _ValidateFanSpeed(argparse.Action): # Checks the values def __call__(self, parser, args, values, option_string=None): - - # Convert percentage to fan level if isinstance(values, str): - try: - values = int(values[:-1]) // 100 * 255 - except ValueError as e: - raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-255 or 0-100%") - - # Store the fan level as fan_speed - if isinstance(values, int): - if 0 <= values <= 255: - amdsmi_helpers.confirm_out_of_spec_warning() - setattr(args, self.dest, values) - else: - raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-255 or 0-100%") - + # Convert percentage to fan level + if '%' in values: + try: + amdsmi_helpers.confirm_out_of_spec_warning() + values = int(int(values[:-1]) / 100 * 255) + setattr(args, self.dest, values) + except ValueError as e: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-100%") + else: # Store the fan level as fan_speed + values = int(values) + if 0 <= values <= 255: + amdsmi_helpers.confirm_out_of_spec_warning() + setattr(args, self.dest, values) + else: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-255") + else: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-255 or 0-100%") return _ValidateFanSpeed def _validate_overdrive_percent(self): """ Validate overdrive percentage input""" - amdsmi_helpers = self.amdsmi_helpers + amdsmi_helpers = self.helpers class _ValidateOverdrivePercent(argparse.Action): # Checks the values def __call__(self, parser, args, values, option_string=None): if isinstance(values, str): try: if values[-1] == '%': - values = int(values[:-1]) + over_drive_percent = int(values[:-1]) else: - values = int(values) - except ValueError as e: - raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-20 or 0-20%") + over_drive_percent = int(values) - if isinstance(values, int): - if 0 <= values <= 20: - over_drive_percent = values - else: + if 0 <= over_drive_percent <= 20: + amdsmi_helpers.confirm_out_of_spec_warning() + setattr(args, self.dest, over_drive_percent) + else: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be within range 0-20 or 0-20%") + except ValueError: raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-20 or 0-20%") - - amdsmi_helpers.confirm_out_of_spec_warning() - setattr(args, self.dest, over_drive_percent) + else: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-20 or 0-20%") return _ValidateOverdrivePercent def _add_reset_parser(self, subparsers, func): - if not(self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux()): + if not(self.helpers.is_baremetal() and self.helpers.is_linux()): # This subparser is only applicable to Baremetal Linux return @@ -823,6 +811,7 @@ class AMDSMIParser(argparse.ArgumentParser): rocm_smi_parser.add_argument('-l', '--load', action=self._check_input_file_path(), type=str, required=False, help=load_help) rocm_smi_parser.add_argument('-s', '--save', action=self._check_output_file_path(), type=str, required=False, help=save_help) + rocm_smi_parser.add_argument('-b', '--showbw', action='store_true', required=False, help=showbw_help) rocm_smi_parser.add_argument('-t', '--showtempgraph', action='store_true', required=False, help=showtempgraph_help) rocm_smi_parser.add_argument('-m', '--showmclkrange', action='store_true', required=False, help=showmclkrange_help) rocm_smi_parser.add_argument('-c', '--showsclkrange', action='store_true', required=False, help=showsclkrange_help) @@ -832,13 +821,10 @@ class AMDSMIParser(argparse.ArgumentParser): rocm_smi_parser.add_argument('-v', '--showclkvolt', action='store_true', required=False, help=showclkvolt_help) rocm_smi_parser.add_argument('-f', '--showclkfrq', action='store_true', required=False, help=showclkfrq_help) + def error(self, message): - args = sys.argv[1:] - outputformat = "human" - if "--json" in args or "--j" in args: - outputformat = "json" - elif "--csv" in args or "-c" in args: - outputformat = "csv" + outputformat = self.helpers.get_output_format() + if "argument : invalid choice: " in message: l = len("argument : invalid choice: ") + 1 message = message[l:] diff --git a/py-interface/_version.py b/py-interface/_version.py index a0235ce508..e34424611d 100644 --- a/py-interface/_version.py +++ b/py-interface/_version.py @@ -1 +1 @@ -__version__ = "0.0.2" \ No newline at end of file +__version__ = "0.0.3" \ No newline at end of file diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 58e47c187c..0542fed5da 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2437,7 +2437,7 @@ def amdsmi_dev_get_ecc_count( ec = amdsmi_wrapper.amdsmi_error_count_t() _check_res( - amdsmi_wrapper. amdsmi_dev_get_ecc_count( + amdsmi_wrapper.amdsmi_dev_get_ecc_count( device_handle, block, ctypes.byref(ec)) ) diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index b4222da681..fbb3b5de19 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -168,17 +168,17 @@ def char_pointer_cast(string, encoding='utf-8'): _libraries = {} from pathlib import Path +libamd_smi_optrocm = Path(__file__).parents[3] / "/lib/libamd_smi.so" libamd_smi_cpack = Path("@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/libamd_smi.so") -libamd_smi_optrocm = Path("/opt/rocm/lib/libamd_smi.so") libamd_smi_parent_dir = Path(__file__).resolve().parent / "libamd_smi.so" -libamd_smi_cwd = Path.cwd() +libamd_smi_cwd = Path.cwd() / "libamd_smi.so" -if libamd_smi_cpack.is_file(): - # try to find library in install directory provided by CMake - _libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_cpack) -elif libamd_smi_optrocm.is_file(): +if libamd_smi_optrocm.is_file(): # try /opt/rocm/lib as a fallback _libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_optrocm) +elif libamd_smi_cpack.is_file(): + # try to find library in install directory provided by CMake + _libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_cpack) elif libamd_smi_parent_dir.is_file(): # try to fall back to parent directory _libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_parent_dir) @@ -187,7 +187,6 @@ else: _libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_cwd) - # values for enumeration 'c__EA_amdsmi_init_flags_t' c__EA_amdsmi_init_flags_t__enumvalues = { 0: 'AMDSMI_INIT_ALL_DEVICES', diff --git a/py-interface/pyproject.toml b/py-interface/pyproject.toml index f0ca8dc982..ce8aa66a12 100644 --- a/py-interface/pyproject.toml +++ b/py-interface/pyproject.toml @@ -10,7 +10,7 @@ name = "amdsmi" authors = [ {name = "AMD", email = "amd-smi.support@amd.com"}, ] -version = '0.1' +version = '0.3' license = {file = "amdsmi/LICENSE"} readme = {file = "amdsmi/README.md", content-type = "text/markdown"} description = "SMI LIB - AMD GPU Monitoring Library" diff --git a/tools/generator.py b/tools/generator.py index fb380b6c20..1e9d8aa588 100644 --- a/tools/generator.py +++ b/tools/generator.py @@ -106,17 +106,17 @@ def main(): library_path = os.path.join(os.path.dirname(__file__), library) line_to_replace = "_libraries['{}'] = ctypes.CDLL('{}')".format(library_name, library_path) new_line = f"""from pathlib import Path +libamd_smi_optrocm = Path(__file__).parents[3] / "/lib/{library_name}" libamd_smi_cpack = Path("@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/{library_name}") -libamd_smi_optrocm = Path("/opt/rocm/lib/{library_name}") libamd_smi_parent_dir = Path(__file__).resolve().parent / "{library_name}" -libamd_smi_cwd = Path.cwd() +libamd_smi_cwd = Path.cwd() / "{library_name}" -if libamd_smi_cpack.is_file(): - # try to find library in install directory provided by CMake - _libraries['{library_name}'] = ctypes.CDLL(libamd_smi_cpack) -elif libamd_smi_optrocm.is_file(): +if libamd_smi_optrocm.is_file(): # try /opt/rocm/lib as a fallback _libraries['{library_name}'] = ctypes.CDLL(libamd_smi_optrocm) +elif libamd_smi_cpack.is_file(): + # try to find library in install directory provided by CMake + _libraries['{library_name}'] = ctypes.CDLL(libamd_smi_cpack) elif libamd_smi_parent_dir.is_file(): # try to fall back to parent directory _libraries['{library_name}'] = ctypes.CDLL(libamd_smi_parent_dir)