SWDEV-381302 - Added Error handling for Set & Metric

Bug Fixes for Set and Fan
Updated lib_amdsmi.so directory access
Backwards compatibility fixes for gpuv-smi

Change-Id: I3b7977859c750c1c3d6f41eaa761c81d8b9e5184
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Этот коммит содержится в:
Maisam Arif
2023-04-15 02:00:37 -05:00
родитель 06f12c4700
Коммит 0830e983d8
10 изменённых файлов: 611 добавлений и 481 удалений
+1 -1
Просмотреть файл
@@ -1 +1 @@
__version__ = "0.0.2"
__version__ = "0.0.3"
+115 -85
Просмотреть файл
@@ -1,6 +1,26 @@
#!/usr/bin/env python3
#
# Copyright (C) 2023 Advanced Micro Devices. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import json
import sys
AMDSMI_ERROR_MESSAGES = {
0: "Sucess",
@@ -23,12 +43,12 @@ AMDSMI_ERROR_MESSAGES = {
17: "Out of bounds",
18: "Initialization error",
19: "Internal reference counter exceeded",
# Reserved for future error messages
30: "Device busy",
31: "Device Not found",
32: "Device not initialized",
33: "No more free slot",
# Reserved for future error messages
40: "No data was found for given input",
41: "Insufficient size for operation",
42: "Unexpected size of data was read",
@@ -41,142 +61,152 @@ def _get_error_message(error_code):
return "Generic error"
class AmdSmiException(Exception):
    """Base exception whose text depends on the selected output format.

    Subclasses fill in ``json_message`` (a dict), ``csv_message`` and
    ``stdout_message`` and set ``output_format``; ``str(exc)`` then returns
    the rendering that matches the format.
    """

    def __init__(self):
        # Initialise the Exception machinery before adding our own state.
        super().__init__()
        self.json_message = {}
        self.csv_message = ''
        self.stdout_message = ''
        self.message = ''
        self.output_format = ''

    def __str__(self):
        """Return the message for the current ``output_format``.

        'json' serialises ``json_message``, 'csv' returns ``csv_message``,
        and any other value falls back to ``stdout_message``. The chosen
        text is also cached on ``self.message``.
        """
        if self.output_format == 'json':
            self.message = json.dumps(self.json_message)
        elif self.output_format == 'csv':
            self.message = self.csv_message
        else:
            self.message = self.stdout_message

        return self.message
class AmdSmiInvalidCommandException(AmdSmiException):
    """Error for an invalid command (code -1).

    Args:
        command: The rejected command; echoed in the error text.
        outputformat: Format selector stored as ``output_format``; 'json'
            and 'csv' are special-cased by ``AmdSmiException.__str__``,
            anything else yields the plain-text message.
    """

    def __init__(self, command, outputformat):
        super().__init__()
        self.value = -1
        self.command = command
        self.output_format = outputformat

        common_message = f"Command '{self.command}' is invalid. Run '--help' for more info."

        self.json_message["error"] = common_message
        self.json_message["code"] = self.value
        self.csv_message = f"error,code\n{common_message}, {self.value}"
        self.stdout_message = f"{common_message} Error code: {self.value}"

        # Render once so `.message` is already populated for callers that
        # read the attribute directly instead of calling str().
        self.message = str(self)
class AmdSmiInvalidParameterException(AmdSmiException):
    """Error for an invalid parameter (code -2).

    Args:
        command: The rejected parameter; echoed in the error text.
        outputformat: Format selector stored as ``output_format``; 'json'
            and 'csv' are special-cased by ``AmdSmiException.__str__``,
            anything else yields the plain-text message.
    """

    def __init__(self, command, outputformat):
        super().__init__()
        self.value = -2
        self.command = command
        self.output_format = outputformat

        common_message = f"Parameter '{self.command}' is invalid. Run '--help' for more info."

        self.json_message["error"] = common_message
        self.json_message["code"] = self.value
        self.csv_message = f"error,code\n{common_message}, {self.value}"
        self.stdout_message = f"{common_message} Error code: {self.value}"

        # Render once so `.message` is already populated for callers that
        # read the attribute directly instead of calling str().
        self.message = str(self)
class AmdSmiDeviceNotFoundException(AmdSmiException):
    """Error for a GPU index that cannot be found on the system (code -3).

    Args:
        command: The GPU index that was requested; echoed in the error text.
        outputformat: Format selector stored as ``output_format``; 'json'
            and 'csv' are special-cased by ``AmdSmiException.__str__``,
            anything else yields the plain-text message.
    """

    def __init__(self, command, outputformat):
        super().__init__()
        self.value = -3
        self.command = command
        self.output_format = outputformat

        common_message = f"GPU Device with GPU_INDEX '{self.command}' cannot be found on the system."

        self.json_message["error"] = common_message
        self.json_message["code"] = self.value
        self.csv_message = f"error,code\n{common_message}, {self.value}"
        self.stdout_message = f"{common_message} Error code: {self.value}"

        # Render once so `.message` is already populated for callers that
        # read the attribute directly instead of calling str().
        self.message = str(self)
class AmdSmiInvalidFilePathException(AmdSmiException):
    """Error for a path that cannot be found (code -4).

    Args:
        command: The path that was requested; echoed in the error text.
        outputformat: Format selector stored as ``output_format``; 'json'
            and 'csv' are special-cased by ``AmdSmiException.__str__``,
            anything else yields the plain-text message.
    """

    def __init__(self, command, outputformat):
        super().__init__()
        self.value = -4
        self.command = command
        self.output_format = outputformat

        common_message = f"Path '{self.command}' cannot be found."

        self.json_message["error"] = common_message
        self.json_message["code"] = self.value
        self.csv_message = f"error,code\n{common_message}, {self.value}"
        self.stdout_message = f"{common_message} Error code: {self.value}"

        # Render once so `.message` is already populated for callers that
        # read the attribute directly instead of calling str().
        self.message = str(self)
class AmdSmiInvalidParameterValueException(AmdSmiException):
    """Error for a value of the wrong type or format (code -5).

    Args:
        command: The rejected value; echoed in the error text.
        outputformat: Format selector stored as ``output_format``; 'json'
            and 'csv' are special-cased by ``AmdSmiException.__str__``,
            anything else yields the plain-text message.
    """

    def __init__(self, command, outputformat):
        super().__init__()
        self.value = -5
        self.command = command
        self.output_format = outputformat

        common_message = f"Value '{self.command}' is not of valid type or format. Run '--help' for more info."

        self.json_message["error"] = common_message
        self.json_message["code"] = self.value
        self.csv_message = f"error,code\n{common_message}, {self.value}"
        self.stdout_message = f"{common_message} Error code: {self.value}"

        # Render once so `.message` is already populated for callers that
        # read the attribute directly instead of calling str().
        self.message = str(self)
class AmdSmiMissingParameterValueException(AmdSmiException):
    """Error for a parameter that was given without a value (code -6).

    Args:
        command: The parameter missing its value; echoed in the error text.
        outputformat: Format selector stored as ``output_format``; 'json'
            and 'csv' are special-cased by ``AmdSmiException.__str__``,
            anything else yields the plain-text message.
    """

    def __init__(self, command, outputformat):
        super().__init__()
        self.value = -6
        self.command = command
        self.output_format = outputformat

        common_message = f"Parameter '{self.command}' requires a value. Run '--help' for more info."

        self.json_message["error"] = common_message
        self.json_message["code"] = self.value
        self.csv_message = f"error,code\n{common_message}, {self.value}"
        self.stdout_message = f"{common_message} Error code: {self.value}"

        # Render once so `.message` is already populated for callers that
        # read the attribute directly instead of calling str().
        self.message = str(self)
class AmdSmiParameterNotSupportedException(AmdSmiException):
    """Error for a parameter that is not supported on the system (code -8).

    Args:
        command: The unsupported parameter; echoed in the error text.
        outputformat: Format selector stored as ``output_format``; 'json'
            and 'csv' are special-cased by ``AmdSmiException.__str__``,
            anything else yields the plain-text message.
    """

    def __init__(self, command, outputformat):
        super().__init__()
        self.value = -8
        self.command = command
        self.output_format = outputformat

        common_message = f"Parameter '{self.command}' is not supported on the system. Run '--help' for more info."

        self.json_message["error"] = common_message
        self.json_message["code"] = self.value
        self.csv_message = f"error,code\n{common_message}, {self.value}"
        self.stdout_message = f"{common_message} Error code: {self.value}"

        # Render once so `.message` is already populated for callers that
        # read the attribute directly instead of calling str().
        self.message = str(self)
class AmdSmiUnknownErrorException(AmdSmiException):
    """Catch-all error for an unknown failure (code -100).

    Args:
        command: Stored on the instance as ``command``; it is not included
            in the error text.
        outputformat: Format selector stored as ``output_format``; 'json'
            and 'csv' are special-cased by ``AmdSmiException.__str__``,
            anything else yields the plain-text message.
    """

    def __init__(self, command, outputformat):
        super().__init__()
        self.value = -100
        self.command = command
        self.output_format = outputformat

        common_message = "An unknown error has occurred. Run 'help' for more info."

        self.json_message["error"] = common_message
        self.json_message["code"] = self.value
        self.csv_message = f"error,code\n{common_message}, {self.value}"
        self.stdout_message = f"{common_message} Error code: {self.value}"

        # Render once so `.message` is already populated for callers that
        # read the attribute directly instead of calling str().
        self.message = str(self)
class AmdSmiAMDSMIErrorException(AmdSmiException):
    """Error wrapping a status code returned by the AMDSMI library.

    The reported code is ``-1000 - abs(error_code)``; the raw library code
    is kept on ``smilibcode``.

    Args:
        outputformat: Format selector stored as ``output_format``; 'json'
            and 'csv' are special-cased by ``AmdSmiException.__str__``,
            anything else yields the plain-text message.
        error_code: Numeric status code from the library; its absolute
            value is looked up in ``AMDSMI_ERROR_MESSAGES``.
    """

    def __init__(self, outputformat, error_code):
        super().__init__()
        self.value = -1000 - abs(error_code)
        self.smilibcode = error_code
        self.output_format = outputformat

        # Use a lookup with a fallback: the table has gaps, and indexing it
        # directly would raise KeyError while an error is being reported.
        library_message = AMDSMI_ERROR_MESSAGES.get(abs(self.smilibcode), "Generic error")
        common_message = f"AMDSMI has returned error '{self.value}' - '{library_message}'"

        self.json_message["error"] = common_message
        self.json_message["code"] = self.value
        self.csv_message = f"error,code\n{common_message}, {self.value}"
        self.stdout_message = f"{common_message} Error code: {self.value}"

        # Render once so `.message` is already populated for callers that
        # read the attribute directly instead of calling str().
        self.message = str(self)
+336 -247
Просмотреть файл
@@ -236,6 +236,7 @@ class AMDSMICommands():
bus_info = e.get_error_info()
if not self.all_arguments:
raise e
try:
bus_output_info['bdf'] = amdsmi_interface.amdsmi_get_device_bdf(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
@@ -245,7 +246,6 @@ class AMDSMICommands():
bus_output_info.update(bus_info)
values_dict['bus'] = bus_output_info
if args.vbios:
try:
vbios_info = amdsmi_interface.amdsmi_get_vbios_info(args.gpu)
@@ -282,6 +282,7 @@ class AMDSMICommands():
power_limit = e.get_error_info()
if not self.all_arguments:
raise e
try:
temp_edge_limit = amdsmi_interface.amdsmi_dev_get_temp_metric(args.gpu,
amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL)
@@ -343,10 +344,6 @@ class AMDSMICommands():
try:
caps_info = amdsmi_interface.amdsmi_get_caps_info(args.gpu)
if self.logger.is_gpuvsmi_compatibility():
del caps_info['ras_supported']
caps_info['gfx'] = caps_info.pop('gfx')
if self.logger.is_human_readable_format():
for capability_name, capability_value in caps_info.items():
if isinstance(capability_value, list):
@@ -565,8 +562,8 @@ class AMDSMICommands():
def metric(self, args, multiple_devices=False, watching_output=False, gpu=None,
usage=None, watch=None, watch_time=None, iterations=None, fb_usage=None, power=None,
clock=None, temperature=None, ecc=None, pcie=None, voltage=None, fan=None,
pcie_usage=None, voltage_curve=None, overdrive=None, mem_overdrive=None,
perf_level=None, replay_count=None, xgmi_err=None, energy=None, mem_usage=None):
voltage_curve=None, overdrive=None, mem_overdrive=None, perf_level=None,
replay_count=None, xgmi_err=None, energy=None, mem_usage=None):
"""Get Metric information for target gpu
Args:
@@ -586,7 +583,6 @@ class AMDSMICommands():
pcie (bool, optional): Value override for args.pcie. Defaults to None.
voltage (bool, optional): Value override for args.voltage. Defaults to None.
fan (bool, optional): Value override for args.fan. Defaults to None.
pcie_usage (bool, optional): Value override for args.pcie_usage. Defaults to None.
voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None.
overdrive (bool, optional): Value override for args.overdrive. Defaults to None.
mem_overdrive (bool, optional): Value override for args.mem_overdrive. Defaults to None.
@@ -630,8 +626,6 @@ class AMDSMICommands():
args.voltage = voltage
if fan:
args.fan = fan
if pcie_usage:
args.pcie_usage = pcie_usage
if voltage_curve:
args.voltage_curve = voltage_curve
if overdrive:
@@ -676,15 +670,13 @@ class AMDSMICommands():
else:
raise IndexError("args.gpu should not be an empty list")
# Check if any of the options have been set, if not then set them all to true
if not any([args.usage, args.fb_usage, args.power, args.clock, args.temperature, args.ecc, args.pcie, args.voltage, args.fan,
args.pcie_usage, args.voltage_curve, args.overdrive, args.mem_overdrive, args.perf_level,
args.replay_count, args.xgmi_err, args.energy, args.mem_usage]):
if not any([args.usage, args.fb_usage, args.power, args.clock, args.temperature, args.ecc, args.pcie, args.voltage,
args.fan, args.voltage_curve, args.overdrive, args.mem_overdrive, args.perf_level, args.replay_count,
args.xgmi_err, args.energy, args.mem_usage]):
args.usage = args.fb_usage = args.power = args.clock = args.temperature = args.ecc = args.pcie = args.voltage = args.fan = \
args.pcie_usage = args.voltage_curve = args.overdrive = args.mem_overdrive = args.perf_level = \
args.replay_count = args.xgmi_err = args.energy = args.mem_usage = self.all_arguments = True
args.voltage_curve = args.overdrive = args.mem_overdrive = args.perf_level = args.replay_count = args.xgmi_err = \
args.energy = args.mem_usage = self.all_arguments = True
# Add timestamp and store values for specified arguments
values_dict = {}
@@ -704,7 +696,9 @@ class AMDSMICommands():
values_dict['usage'] = engine_usage
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['usage'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.fb_usage:
try:
vram_usage = amdsmi_interface.amdsmi_get_vram_usage(args.gpu)
@@ -720,21 +714,49 @@ class AMDSMICommands():
values_dict['fb_usage'] = vram_usage
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['fb_usage'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.power:
power_dict = {}
try:
average_socket_power = amdsmi_interface.amdsmi_get_power_measure(args.gpu)['average_socket_power']
if self.logger.is_gpuvsmi_compatibility():
pass
power_measure = amdsmi_interface.amdsmi_get_power_measure(args.gpu)
power_dict = {'average_socket_power': power_measure['average_socket_power'],
'voltage_gfx': power_measure['voltage_gfx'],
'voltage_soc': amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info,
'voltage_mem': amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info}
if self.logger.is_human_readable_format():
unit = 'W'
average_socket_power = f"{average_socket_power} {unit}"
power_dict['average_socket_power'] = f"{power_dict['average_socket_power']} W"
power_dict['voltage_gfx'] = f"{power_dict['voltage_gfx']} mV"
power_dict['voltage_soc'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
power_dict['voltage_mem'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
values_dict['power'] = average_socket_power
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
power_dict = {'average_socket_power': e.get_error_info(),
'voltage_gfx': e.get_error_info(),
'voltage_soc': e.get_error_info(),
'voltage_mem': e.get_error_info()}
if not self.all_arguments:
raise e
if self.logger.is_gpuvsmi_compatibility():
power_dict['current_power'] = power_dict.pop('average_socket_power')
power_dict['current_voltage'] = power_dict.pop('voltage_gfx')
power_dict['current_voltage_soc'] = power_dict.pop('voltage_soc')
power_dict['current_voltage_mem'] = power_dict.pop('voltage_mem')
try:
power_dict['current_fan_rpm'] = amdsmi_interface.amdsmi_dev_get_fan_rpms(args.gpu, 0)
if self.logger.is_human_readable_format():
power_dict['current_fan_rpm'] = f"{power_dict['current_fan_rpm']} RPM"
except amdsmi_exception.AmdSmiLibraryException as e:
power_dict['current_fan_rpm'] = e.get_error_info()
if not self.all_arguments:
raise e
values_dict['power'] = power_dict
if args.clock:
try:
clock_gfx = amdsmi_interface.amdsmi_get_clock_measure(args.gpu, amdsmi_interface.AmdSmiClkType.GFX)
@@ -751,7 +773,9 @@ class AMDSMICommands():
values_dict['clock'] = clocks
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['clock'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.temperature:
try:
temperature_edge_current = amdsmi_interface.amdsmi_dev_get_temp_metric(
@@ -761,29 +785,44 @@ class AMDSMICommands():
temperature_vram_current = amdsmi_interface.amdsmi_dev_get_temp_metric(
args.gpu, amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
temperatures = { 'edge': temperature_edge_current,
temperatures = {'edge': temperature_edge_current,
'hotspot': temperature_junction_current,
'mem': temperature_vram_current}
if self.logger.is_gpuvsmi_compatibility():
temperatures = { 'edge_temperature': temperature_edge_current,
temperatures = {'edge_temperature': temperature_edge_current,
'hotspot_temperature': temperature_junction_current,
'mem_temperature': temperature_vram_current}
if self.logger.is_human_readable_format():
unit = '\N{DEGREE SIGN}C'
if self.logger.is_gpuvsmi_compatibility():
unit = 'C'
for temperature_value in temperatures:
temperatures[temperature_value] = f"{temperatures[temperature_value]} {unit}"
values_dict['temperature'] = temperatures
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['temperature'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.ecc:
ecc_dict = {}
try:
values_dict['ecc'] = amdsmi_interface.amdsmi_get_ecc_error_count(args.gpu)
ras_states = amdsmi_interface.amdsmi_get_ras_block_features_enabled(args.gpu)
for state in ras_states:
if state['status'] == amdsmi_interface.AmdSmiRasErrState.ENABLED:
gpu_block = amdsmi_interface.AmdSmiGpuBlock[state['block']]
ecc_count = amdsmi_interface.amdsmi_get_ecc_error_count(args.gpu, gpu_block)
ecc_dict[state['block']] = {'correctable' : ecc_count['correctable_count'],
'uncorrectable': ecc_count['uncorrectable_count']}
if ecc_dict == {}:
ecc_dict = 'No RAS Blocks Enabled'
values_dict['ecc'] = ecc_dict
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['ecc'] = e.get_error_info()
raise e
if not self.all_arguments:
raise e
if args.pcie:
try:
pcie_link_status = amdsmi_interface.amdsmi_get_pcie_link_caps(args.gpu)
@@ -798,7 +837,9 @@ class AMDSMICommands():
values_dict['pcie'] = pcie_link_status
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['pcie'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.voltage:
try:
volt_metric = amdsmi_interface.amdsmi_dev_get_volt_metric(
@@ -810,38 +851,39 @@ class AMDSMICommands():
values_dict['voltage'] = volt_metric
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['voltage'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.fan:
try:
fan_speed = amdsmi_interface.amdsmi_dev_get_fan_speed(args.gpu, 0)
fan_speed_error = False
except amdsmi_exception.AmdSmiLibraryException as e:
fan_speed = e.get_error_info()
fan_speed_error = True
try:
fan_max = amdsmi_interface.amdsmi_dev_get_fan_speed_max(args.gpu, 0)
if isinstance(fan_speed, int) and fan_max > 0:
if not fan_speed_error and fan_max > 0:
fan_percent = round((float(fan_speed) / float(fan_max)) * 100, 2)
if self.logger.is_human_readable_format():
unit = '%'
fan_percent = f"{fan_percent} {unit}"
else:
fan_percent = 'Unable to detect fan speed'
fan_rpm = amdsmi_interface.amdsmi_dev_get_fan_rpms(args.gpu, 0)
values_dict['fan'] = {'speed': fan_speed,
'max' : fan_max,
'rpm' : fan_rpm,
'usage' : fan_percent}
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
if args.pcie_usage:
fan_max = e.get_error_info()
fan_percent = 'Unable to detect fan speed'
try:
pcie_link_status = amdsmi_interface.amdsmi_get_pcie_link_status(args.gpu)
if self.logger.is_human_readable_format():
unit ='MT/s'
pcie_link_status['pcie_speed'] = f"{pcie_link_status['pcie_speed']} {unit}"
values_dict['pcie_usage'] = pcie_link_status
fan_rpm = amdsmi_interface.amdsmi_dev_get_fan_rpms(args.gpu, 0)
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
fan_rpm = e.get_error_info()
values_dict['fan'] = {'speed': fan_speed,
'max' : fan_max,
'rpm' : fan_rpm,
'usage' : fan_percent}
if args.voltage_curve:
try:
od_volt = amdsmi_interface.amdsmi_dev_get_od_volt_info(args.gpu)
@@ -862,7 +904,6 @@ class AMDSMICommands():
values_dict['voltage_curve'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.overdrive:
try:
overdrive_level = amdsmi_interface.amdsmi_dev_get_overdrive_level(args.gpu)
@@ -873,29 +914,34 @@ class AMDSMICommands():
values_dict['overdrive'] = overdrive_level
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['overdrive'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.mem_overdrive:
values_dict['mem_overdrive'] = amdsmi_interface.AmdSmiRetCode.NOT_IMPLEMENTED
values_dict['mem_overdrive'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
if args.perf_level:
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
values_dict['perf_level'] = perf_level
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['perf_level'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.replay_count:
try:
pci_replay_counter = amdsmi_interface.amdsmi_dev_get_pci_replay_counter(args.gpu)
values_dict['replay_count'] = pci_replay_counter
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['replay_count'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.xgmi_err:
try:
values_dict['xgmi_err'] = amdsmi_interface.amdsmi_dev_xgmi_error_status(args.gpu)
except amdsmi_interface.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.AmdSmiRetCode.ERR_NOT_SUPPORTED:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NOT_SUPPORTED:
values_dict['xgmi_err'] = 'N/A'
else:
elif not self.all_arguments:
raise e
if args.energy:
try:
@@ -907,14 +953,16 @@ class AMDSMICommands():
values_dict['energy'] = energy
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
values_dict['energy'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.mem_usage:
memory_total = {}
try:
memory_total_vram = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM)
memory_total_vis_vram = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM)
memory_total_gtt = amdsmi_interface.amdsmi_dev_get_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT)
memory_total = {}
# Convert mem_usage to megabytes
memory_total['vram'] = memory_total_vram // (1024*1024)
memory_total['vis_vram'] = memory_total_vis_vram // (1024*1024)
@@ -927,10 +975,36 @@ class AMDSMICommands():
memory_total['vis_vram'] = f"{memory_total['vis_vram']} {unit}"
memory_total['gtt'] = f"{memory_total['gtt']} {unit}"
values_dict['mem_usage'] = memory_total
except amdsmi_exception.AmdSmiLibraryException as e:
raise e
memory_total['vram'] = e.get_error_info()
memory_total['vis_vram'] = e.get_error_info()
memory_total['gtt'] = e.get_error_info()
if not self.all_arguments:
raise e
try:
total_used_vram = amdsmi_interface.amdsmi_dev_get_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM)
total_used_vis_vram = amdsmi_interface.amdsmi_dev_get_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM)
total_used_gtt = amdsmi_interface.amdsmi_dev_get_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT)
# Convert mem_usage to megabytes
memory_total['used_vram'] = total_used_vram // (1024*1024)
memory_total['used_vis_vram'] = total_used_vis_vram // (1024*1024)
memory_total['used_gtt'] = total_used_gtt // (1024*1024)
if self.logger.is_human_readable_format():
memory_total['used_vram'] = f"{memory_total['used_vram']} {unit}"
memory_total['used_vis_vram'] = f"{memory_total['used_vis_vram']} {unit}"
memory_total['used_gtt'] = f"{memory_total['used_gtt']} {unit}"
except amdsmi_exception.AmdSmiLibraryException as e:
memory_total['used_vram'] = e.get_error_info()
memory_total['used_vis_vram'] = e.get_error_info()
memory_total['used_gtt'] = e.get_error_info()
if not self.all_arguments:
raise e
values_dict['mem_usage'] = memory_total
# Store values in logger.output
self.logger.store_output(args.gpu, 'values', values_dict)
@@ -1126,6 +1200,7 @@ class AMDSMICommands():
for thread in threads:
thread.join()
def topology(self, args, multiple_devices=False, gpu=None, access=None,
weight=None, hops=None, type=None, numa=None, numa_bw=None):
""" Get topology information for target gpus
@@ -1196,6 +1271,7 @@ class AMDSMICommands():
if args.numa_bw:
pass
def set_value(self, args, multiple_devices=False, gpu=None, clock=None, sclk=None, mclk=None,
pcie=None, slevel=None, mlevel=None, vc=None, srange=None, mrange=None,
fan=None, perflevel=None, overdrive=None, memoverdrive=None,
@@ -1277,6 +1353,18 @@ class AMDSMICommands():
args.gpu = device_handle
# Build GPU string for errors
try:
gpu_bdf = amdsmi_interface.amdsmi_get_device_bdf(args.gpu)
except amdsmi_exception.AmdSmiLibraryException:
gpu_bdf = f'BDF Unavailable for {args.gpu}'
try:
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
except IndexError:
gpu_id = f'ID Unavailable for {args.gpu}'
gpu_string = f"GPU ID: {gpu_id} BDF:{gpu_bdf}"
# Handle args
if args.clock:
clock_type, freq_bitmask = args.clock
@@ -1284,231 +1372,232 @@ class AMDSMICommands():
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to get performance level of {args.gpu}")
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual")
if clock_type != amdsmi_interface.AmdSmiClkType.PCIE.value:
try:
amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
clock_type = amdsmi_interface.AmdSmiClkType(clock_type)
raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}")
print(f'Successfully set frequency bitmask on {args.gpu}')
else:
try:
amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
clock_type = amdsmi_interface.AmdSmiClkType(clock_type)
raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}")
print(f'Successfully set frequency bitmask on {args.gpu}')
if args.sclk:
freq_bitmask = args.sclk
clock_type = amdsmi_interface.AmdSmiClkType.SYS
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to get performance level of {args.gpu}")
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual")
try:
amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type.value, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}")
print(f'Successfully set frequency bitmask on {args.gpu}')
if args.mclk:
freq_bitmask = args.sclk
clock_type = amdsmi_interface.AmdSmiClkType.MEM
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to get performance level of {args.gpu}")
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual")
try:
amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type.value, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}")
print(f'Successfully set frequency bitmask on {args.gpu}')
if args.pcie:
freq_bitmask = args.sclk
clock_type = amdsmi_interface.AmdSmiClkType.PCIE
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to get performance level of {args.gpu}")
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual")
try:
amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the {clock_type} clock frequency on {args.gpu}")
print(f'Successfully set frequency bitmask on {args.gpu}')
if args.slevel:
level, value = args.slevel
level = amdsmi_interface.AmdSmiFreqInd(level).value
clock_type = amdsmi_interface.AmdSmiClkType.SYS
try:
amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type.value)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {args.gpu}")
print(f'Successfully changed clock frequency on {args.gpu}')
if args.mlevel:
level, value = args.mlevel
level = amdsmi_interface.AmdSmiFreqInd(level).value
clock_type = amdsmi_interface.AmdSmiClkType.MEM
try:
amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type.value)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {args.gpu}")
print(f'Successfully changed clock frequency on {args.gpu}')
if args.vc:
point, clk, volt = args.vc
try:
amdsmi_interface.amdsmi_dev_set_od_volt_info(args.gpu, point, clk, volt)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the Voltage Curve point {point} to {clk}(MHz) {volt}(mV) on {args.gpu}")
print(f'Successfully set voltage point {point} to {clk}(MHz) {volt}(mV) on {args.gpu}')
if args.srange:
min_value, max_value = args.srange
clock_type = amdsmi_interface.AmdSmiClkType.SYS
try:
amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type.value)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {args.gpu}")
print(f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {args.gpu}")
if args.mrange:
min_value, max_value = args.srange
clock_type = amdsmi_interface.AmdSmiClkType.MEM
try:
amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type.value)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {args.gpu}")
print(f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {args.gpu}")
if args.fan:
try:
amdsmi_interface.amdsmi_dev_set_fan_speed(args.gpu, 0, args.fan)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set fan speed {args.fan} on {args.gpu}")
print(f"Successfully set fan speed {args.fan} on {args.gpu}")
if args.perflevel:
perf_levels = amdsmi_interface.amdsmi_wrapper.amdsmi_dev_perf_level_t__enumvalues
for value in perf_levels:
if args.perflevel.lower() in perf_levels[value]:
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, value)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set performance level {args.perflevel} on {args.gpu}")
print(f"Successfully set performance level {args.perflevel} on {args.gpu}")
break
if args.overdrive or args.overdrive == 0:
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to get performance level of {args.gpu}")
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual")
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
try:
amdsmi_interface.amdsmi_dev_set_overdrive_level_v1(args.gpu, args.overdrive)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set overdrive {args.overdrive} to {args.gpu}")
print(f"Successfully to set overdrive {args.overdrive} to {args.gpu}")
if clock_type != amdsmi_interface.AmdSmiClkType.PCIE:
try:
amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
else:
try:
amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
if args.memoverdrive or args.memoverdrive == 0:
self.logger.store_output(args.gpu, 'clock', f'Successfully set clock frequency bitmask for {clock_type}')
if isinstance(args.sclk, int):
freq_bitmask = args.sclk
clock_type = amdsmi_interface.AmdSmiClkType.SYS
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to get performance level of {args.gpu}")
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL.value)
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set the performance level of {args.gpu} to manual")
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
if args.poweroverdrive:
try:
amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
self.logger.store_output(args.gpu, 'sclk', 'Successfully set clock frequency bitmask')
if isinstance(args.mclk, int):
freq_bitmask = args.mclk
clock_type = amdsmi_interface.AmdSmiClkType.MEM
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
try:
amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
self.logger.store_output(args.gpu, 'mclk', 'Successfully set clock frequency bitmask')
if isinstance(args.pcie, int):
freq_bitmask = args.pcie
clock_type = amdsmi_interface.AmdSmiClkType.PCIE
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
try:
amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
self.logger.store_output(args.gpu, 'pcie', 'Successfully set clock frequency bitmask')
if isinstance(args.slevel, int):
level, value = args.slevel
level = amdsmi_interface.AmdSmiFreqInd(level)
clock_type = amdsmi_interface.AmdSmiClkType.SYS
try:
amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {gpu_string}") from e
self.logger.store_output(args.gpu, 'slevel', 'Successfully changed clock frequency')
if isinstance(args.mlevel, int):
level, value = args.mlevel
level = amdsmi_interface.AmdSmiFreqInd(level)
clock_type = amdsmi_interface.AmdSmiClkType.MEM
try:
amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {gpu_string}") from e
self.logger.store_output(args.gpu, 'mlevel', 'Successfully changed clock frequency')
if isinstance(args.vc, int):
point, clk, volt = args.vc
try:
amdsmi_interface.amdsmi_dev_set_od_volt_info(args.gpu, point, clk, volt)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the Voltage Curve point {point} to {clk}(MHz) {volt}(mV) on {gpu_string}") from e
self.logger.store_output(args.gpu, 'vc', f'Successfully set voltage point {point} to {clk}(MHz) {volt}(mV)')
if isinstance(args.srange, int):
min_value, max_value = args.srange
clock_type = amdsmi_interface.AmdSmiClkType.SYS
try:
amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {gpu_string}") from e
self.logger.store_output(args.gpu, 'srange', f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz)")
if isinstance(args.mrange, int):
min_value, max_value = args.srange
clock_type = amdsmi_interface.AmdSmiClkType.MEM
try:
amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {gpu_string}") from e
self.logger.store_output(args.gpu, 'mrange', f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz)")
if isinstance(args.fan, int):
try:
amdsmi_interface.amdsmi_dev_set_fan_speed(args.gpu, 0, args.fan)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set fan speed {args.fan} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'fan', f"Successfully set fan speed {args.fan}")
if args.perflevel:
perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perflevel]
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, perf_level)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set performance level {args.perflevel} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perflevel}")
if isinstance(args.overdrive, int):
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
try:
amdsmi_interface.amdsmi_dev_set_overdrive_level_v1(args.gpu, args.overdrive)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set overdrive {args.overdrive} to {gpu_string}") from e
self.logger.store_output(args.gpu, 'overdrive', f"Successfully to set overdrive level to {args.overdrive}")
if isinstance(args.memoverdrive, int):
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
self.logger.store_output(args.gpu, 'memoverdrive', f"Successfully to set memoverdrive level to {args.memoverdrive}")
if isinstance(args.poweroverdrive, int):
overdrive_power_cap = args.poweroverdrive
try:
power_caps = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to get the power cap info for {args.gpu}")
raise ValueError(f"Unable to get the power cap info for {gpu_string}") from e
if overdrive_power_cap == 0:
overdrive_power_cap = power_caps['power_cap_default']
else:
overdrive_power_cap *= 1000000
if overdrive_power_cap < power_caps['min_power_cap']:
raise ValueError(self, f"Requested power cap: {overdrive_power_cap} is lower than the min power cap: {power_caps['min_power_cap']}")
raise ValueError(f"Requested power cap: {overdrive_power_cap} is lower than the min power cap: {power_caps['min_power_cap']}")
if overdrive_power_cap > power_caps['max_power_cap']:
raise ValueError(self, f"Requested power cap: {overdrive_power_cap} is greater than the max power cap: {power_caps['max_power_cap']}")
raise ValueError(f"Requested power cap: {overdrive_power_cap} is greater than the max power cap: {power_caps['max_power_cap']}")
if overdrive_power_cap == power_caps['power_cap']:
raise ValueError(self, f"Requested power cap: {overdrive_power_cap} is the same as the current power cap: {power_caps['power_cap']}")
raise ValueError(f"Requested power cap: {overdrive_power_cap} is the same as the current power cap: {power_caps['power_cap']}")
try:
amdsmi_interface.amdsmi_dev_set_power_cap(args.gpu, 0, overdrive_power_cap)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set power cap to {overdrive_power_cap} on {args.gpu}")
raise ValueError(f"Unable to set power cap to {overdrive_power_cap} on {gpu_string}") from e
try:
power_caps = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to get the power cap info for {args.gpu} post set")
raise ValueError(f"Unable to get the power cap info for {gpu_string} post set") from e
if power_caps['power_cap'] == overdrive_power_cap:
print(f"Successfully set the power cap {overdrive_power_cap} on {args.gpu}")
self.logger.store_output(args.gpu, 'power_cap', f"Successfully set the power cap {overdrive_power_cap}")
else:
raise ValueError(self, f"Power cap: {overdrive_power_cap} set failed on {args.gpu}")
raise ValueError(f"Power cap: {overdrive_power_cap} set failed on {gpu_string}")
if args.profile:
print(amdsmi_interface.AmdSmiRetCode.NOT_IMPLEMENTED)
if args.perfdeterminism:
self.logger.store_output(args.gpu, 'profile', "Not Yet Implemented")
if isinstance(args.perfdeterminism, int):
try:
amdsmi_interface.amdsmi_set_perf_determinism_mode(args.gpu, args.perfdeterminism)
except amdsmi_exception.AmdSmiLibraryException as e:
raise ValueError(self, f"Unable to set performance determinism and clock frequency to {args.perfdeterminism} on {args.gpu}")
print(f"Successfully enabled performance determinism and set GFX clock frequency to {args.perfdeterminism} on {args.gpu}")
raise ValueError(f"Unable to set performance determinism and clock frequency to {args.perfdeterminism} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perfdeterminism}")
if multiple_devices:
self.logger.store_multiple_device_output()
return # Skip printing when there are multiple devices
self.logger.print_output()
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
@@ -1652,7 +1741,7 @@ class AMDSMICommands():
def rocm_smi(self, args):
    """Placeholder entry point for the legacy rocm-smi command set.

    Args:
        args: parsed command line namespace; not read by this placeholder
    """
    # Print the notice exactly once, with "commands" spelled correctly
    print("Placeholder for rocm-smi legacy commands")
def _event_thread(self, commands, i):
+43 -17
Просмотреть файл
@@ -22,6 +22,7 @@
import logging
import platform
import sys
import time
from pathlib import Path
@@ -118,6 +119,20 @@ class AMDSMIHelpers():
return self._is_windows
def get_output_format(self):
    """Determine the requested output format by scanning sys.argv

    Only the arguments after the program name are inspected. The JSON
    tokens ("--json" / "--j") win over the CSV tokens ("--csv" / "--c")
    when both kinds are present.

    Returns:
        str: "json", "csv", or "human" when neither kind of token is given
    """
    cli_tokens = set(sys.argv[1:])
    if cli_tokens & {"--json", "--j"}:
        return "json"
    if cli_tokens & {"--csv", "--c"}:
        return "csv"
    return "human"
def get_gpu_choices(self):
"""Return dictionary of possible GPU choices and string of the output:
Dictionary will be in format: gpus[ID] : (BDF, UUID, Device Handle)
@@ -307,11 +322,34 @@ class AMDSMIHelpers():
return asic_info['vendor_id'] == AMD_VENDOR_ID
def is_valid_clock_type(self, clock_type):
    """Report whether clock_type is a key of amdsmi_clk_type_t__enumvalues

    Args:
        clock_type: candidate key to look up in the wrapper's clock type table

    Returns:
        tuple: (True when clock_type is a key of the table, otherwise False;
            the table's keys() view, returned in both cases)
    """
    known_clock_types = amdsmi_interface.amdsmi_wrapper.amdsmi_clk_type_t__enumvalues
    return clock_type in known_clock_types, known_clock_types.keys()
def get_perf_levels(self):
    """Collect the names and the distinct values of AmdSmiDevPerfLevel

    Returns:
        tuple: (list of member names in iteration order,
            list of member values de-duplicated through a set)
    """
    level_names = []
    level_values = set()
    for level in amdsmi_interface.AmdSmiDevPerfLevel:
        level_names.append(level.name)
        level_values.add(level.value)
    return level_names, list(level_values)
def get_clock_types(self):
    """Collect the names and the distinct values of AmdSmiClkType

    Returns:
        tuple: (list of member names in iteration order,
            list of member values de-duplicated through a set)
    """
    names = [member.name for member in amdsmi_interface.AmdSmiClkType]
    distinct_values = {member.value for member in amdsmi_interface.AmdSmiClkType}
    return names, list(distinct_values)
def validate_clock_type(self, input_clock_type):
    """Validate a clock type given either by name or by integer value

    A str is compared case-insensitively against the names reported by
    get_clock_types(); on a match the name exactly as reported there (still
    a str) is returned. An int is compared against the reported values; on
    a match it is converted with amdsmi_interface.AmdSmiClkType(). Any other
    input, or an input without a match, is handed back unchanged.

    Args:
        input_clock_type (str | int): clock type name or value to validate

    Returns:
        tuple: (True when the input matched, else False;
            the normalized clock type on a match, else the original input)
    """
    clock_names, clock_values = self.get_clock_types()

    if isinstance(input_clock_type, str):
        wanted = input_clock_type.lower()
        # First name that matches regardless of case, or None when nothing matches
        canonical = next((name for name in clock_names if name.lower() == wanted), None)
        if canonical is not None:
            return True, canonical
        return False, input_clock_type

    if isinstance(input_clock_type, int) and input_clock_type in clock_values:
        return True, amdsmi_interface.AmdSmiClkType(input_clock_type)

    return False, input_clock_type
def confirm_out_of_spec_warning(self, auto_respond=False):
@@ -348,15 +386,3 @@ class AMDSMIHelpers():
return True, profile_presets[profile]
else:
return False, profile_presets.values()
def get_perf_level(self, device_handle):
    """ Return the current performance level of a given device

    @param device_handle: DRM device identifier
    @return: the value reported by amdsmi_interface.amdsmi_dev_get_perf_level
    @raise ValueError: when the library call raises AmdSmiLibraryException;
        the library exception is chained as the cause
    """
    try:
        # Hand the queried level back to the caller instead of discarding it
        return amdsmi_interface.amdsmi_dev_get_perf_level(device_handle)
    except amdsmi_exception.AmdSmiLibraryException as e:
        # Only the message goes into the exception; passing self as well would
        # put the helper object into e.args and garble the printed error
        raise ValueError(f"Unable to get performance level of {device_handle}") from e
+101 -115
Просмотреть файл
@@ -31,20 +31,26 @@ import sys
from _version import __version__
from amdsmi_helpers import AMDSMIHelpers
import amdsmi_cli_exceptions
from BDF import BDF
class AMDSMIParser(argparse.ArgumentParser):
"""Unified Parser for AMDSMI CLI.
This parser doesn't access amdsmi's lib directly,but via AMDSMIHelpers,
this allows for us to use this parser with future OS & Platform integration.
Args:
argparse (ArgumentParser): argparse.ArgumentParser
"""
def __init__(self, version, discovery, static, firmware, bad_pages, metric,
process, profile, event, topology, set_value, reset, rocmsmi):
# Helper variables
self.amdsmi_helpers = AMDSMIHelpers()
self.gpu_choices, self.gpu_choices_str = self.amdsmi_helpers.get_gpu_choices()
self.helpers = AMDSMIHelpers()
self.gpu_choices, self.gpu_choices_str = self.helpers.get_gpu_choices()
self.vf_choices = ['3', '2', '1']
version_string = f"Version: {__version__}"
platform_string = f"Platform: {self.amdsmi_helpers.os_info()}"
platform_string = f"Platform: {self.helpers.os_info()}"
# Adjust argument parser options
super().__init__(
@@ -83,12 +89,7 @@ class AMDSMIParser(argparse.ArgumentParser):
if int_value.isdigit(): # Is digit works only on positive numbers
return int(int_value)
else:
args = sys.argv[1:]
outputformat = "human"
if "--json" in args or "--j" in args:
outputformat = "json"
elif "--csv" in args or "-c" in args:
outputformat = "csv"
outputformat = self.helpers.get_output_format()
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)
@@ -100,6 +101,7 @@ class AMDSMIParser(argparse.ArgumentParser):
If the path is a file and it doesn't exist create and return the file path
"""
class CheckOutputFilePath(argparse.Action):
outputformat = self.helpers.get_output_format()
# Checks the values
def __call__(self, parser, args, values, option_string=None):
path = Path(values)
@@ -107,13 +109,7 @@ class AMDSMIParser(argparse.ArgumentParser):
if path.parent.is_dir():
path.touch()
else:
args = sys.argv[1:]
outputformat = "human"
if "--json" in args or "--j" in args:
outputformat = "json"
elif "--csv" in args or "-c" in args:
outputformat = "csv"
raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, outputformat)
raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat)
if path.is_dir():
path = path / f"{int(time.time())}-amdsmi-output.txt"
@@ -122,13 +118,7 @@ class AMDSMIParser(argparse.ArgumentParser):
elif path.is_file():
setattr(args, self.dest, path)
else:
args = sys.argv[1:]
outputformat = "human"
if "--json" in args or "--j" in args:
outputformat = "json"
elif "--csv" in args or "-c" in args:
outputformat = "csv"
raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, outputformat)
raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat)
return CheckOutputFilePath
@@ -178,8 +168,9 @@ class AMDSMIParser(argparse.ArgumentParser):
If 1 or more device handles are not found then raise an ArgumentError for the first invalid gpu seen
"""
amdsmi_helpers = self.amdsmi_helpers
amdsmi_helpers = self.helpers
class _GPUSelectAction(argparse.Action):
ouputformat=self.helpers.get_output_format()
# Checks the values
def __call__(self, parser, args, values, option_string=None):
status, selected_device_handles = amdsmi_helpers.get_device_handles_from_gpu_selections(gpu_selections=values,
@@ -187,17 +178,10 @@ class AMDSMIParser(argparse.ArgumentParser):
if status:
setattr(args, self.dest, selected_device_handles)
else:
invalid_selection = selected_device_handles
args = sys.argv[1:]
outputformat = "human"
if "--json" in args or "--j" in args:
outputformat = "json"
elif "--csv" in args or "-c" in args:
outputformat = "csv"
if invalid_selection == '':
raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--gpu", outputformat)
if selected_device_handles == '':
raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--gpu", _GPUSelectAction.ouputformat)
else:
raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(invalid_selection, outputformat)
raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles, _GPUSelectAction.ouputformat)
return _GPUSelectAction
@@ -221,6 +205,21 @@ class AMDSMIParser(argparse.ArgumentParser):
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
def _add_watch_arguments(self, subcommand_parser):
    """Register the -w/--watch, -W/--watch_time and -i/--iterations options

    All three options are optional and are converted with self._positive_int.
    --watch uses the plain 'store' action; the other two use the action
    returned by self._check_watch_selected().

    Args:
        subcommand_parser: object whose add_argument() receives the options
    """
    # One row per option: (flags, action factory, metavar, help text)
    watch_options = (
        (('-w', '--watch'), lambda: 'store', 'loop_time',
         "Reprint the command in a loop of Interval seconds"),
        (('-W', '--watch_time'), self._check_watch_selected, 'total_loop_time',
         "The total time to watch the given command"),
        (('-i', '--iterations'), self._check_watch_selected, 'number_of_iterations',
         "Total number of iterations to loop on the given command"),
    )

    for flags, make_action, metavar, help_text in watch_options:
        # The action is created per option, at registration time
        subcommand_parser.add_argument(*flags, action=make_action(), metavar=metavar,
                                       type=self._positive_int, required=False, help=help_text)
def _add_device_arguments(self, subcommand_parser, required=False):
# Device arguments help text
gpu_help = f"Select a GPU ID, BDF, or UUID from the possible choices:\n{self.gpu_choices_str}"
@@ -232,7 +231,7 @@ class AMDSMIParser(argparse.ArgumentParser):
device_args.add_argument('-g', '--gpu', action=self._gpu_select(self.gpu_choices),
nargs='+', help=gpu_help)
if self.amdsmi_helpers.is_hypervisor():
if self.helpers.is_hypervisor():
device_args.add_argument('-v', '--vf', action='store', nargs='+',
help=vf_help, choices=self.vf_choices)
@@ -313,13 +312,13 @@ class AMDSMIParser(argparse.ArgumentParser):
static_parser.add_argument('-c', '--caps', action='store_true', required=False, help=caps_help)
# Options to display on Hypervisors and Baremetal
if self.amdsmi_helpers.is_hypervisor() or self.amdsmi_helpers.is_baremetal():
if self.helpers.is_hypervisor() or self.helpers.is_baremetal():
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
if self.amdsmi_helpers.is_linux():
if self.helpers.is_linux():
static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help)
# Options to only display on a Hypervisor
if self.amdsmi_helpers.is_hypervisor():
if self.helpers.is_hypervisor():
static_parser.add_argument('-d', '--dfc-ucode', action='store_true', required=False, help=dfc_help)
static_parser.add_argument('-f', '--fb-info', action='store_true', required=False, help=fb_help)
static_parser.add_argument('-n', '--num-vf', action='store_true', required=False, help=num_vf_help)
@@ -349,12 +348,12 @@ class AMDSMIParser(argparse.ArgumentParser):
firmware_parser.add_argument('-f', '--ucode-list', '--fw-list', dest='fw_list', action='store_true', required=False, help=fw_list_help, default=True)
# Options to only display on a Hypervisor
if self.amdsmi_helpers.is_hypervisor():
if self.helpers.is_hypervisor():
firmware_parser.add_argument('-e', '--error-records', action='store_true', required=False, help=err_records_help)
def _add_bad_pages_parser(self, subparsers, func):
if not (self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux()):
if not (self.helpers.is_baremetal() and self.helpers.is_linux()):
# The bad_pages subcommand is only applicable to Linux Baremetal systems
return
@@ -369,7 +368,7 @@ class AMDSMIParser(argparse.ArgumentParser):
un_res_help = "Displays unreservable pages"
# Create bad_pages subparser
bad_pages_parser = subparsers.add_parser('bad-pages', help=bad_pages_help, description=bad_pages_subcommand_help, aliases=['bad_pages'])
bad_pages_parser = subparsers.add_parser('bad-pages', help=bad_pages_help, description=bad_pages_subcommand_help)
bad_pages_parser._optionals.title = bad_pages_optionals_title
bad_pages_parser.formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, max_help_position=80, width=90)
bad_pages_parser.set_defaults(func=func)
@@ -393,9 +392,6 @@ class AMDSMIParser(argparse.ArgumentParser):
# Optional arguments help text
usage_help = "Displays engine usage information"
watch_help = "Reprint the command in a loop of Interval seconds"
watch_time_help = "The total time to watch the given command"
iterations_help = "Total number of iterations to loop on the given command"
# Help text for Arguments only Available on Virtual OS and Baremetal platforms
fb_usage_help = "Total and used framebuffer"
@@ -410,7 +406,6 @@ class AMDSMIParser(argparse.ArgumentParser):
# Help text for Arguments only on Linux Baremetal platforms
fan_help = "Current fan speed"
pcie_usage_help = "Estimated PCIe link usage"
vc_help = "Display voltage curve"
overdrive_help = "Current GPU clock overdrive level"
mo_help = "Current memory clock overdrive level"
@@ -435,21 +430,18 @@ class AMDSMIParser(argparse.ArgumentParser):
# Add Device args
self._add_device_arguments(metric_parser, required=False)
# Add Watch args
self._add_watch_arguments(metric_parser)
# Optional Args
metric_parser.add_argument('-u', '--usage', action='store_true', required=False, help=usage_help)
metric_parser.add_argument('-w', '--watch', action='store', metavar='Interval',
type=self._positive_int, required=False, help=watch_help)
metric_parser.add_argument('-W', '--watch_time', action=self._check_watch_selected(), metavar='Duration',
type=self._positive_int, required=False, help=watch_time_help)
metric_parser.add_argument('-i', '--iterations', action=self._check_watch_selected(), metavar='Iterations',
type=self._positive_int, required=False, help=iterations_help)
# Optional Args for Virtual OS and Baremetal systems
if self.amdsmi_helpers.is_virtual_os() or self.amdsmi_helpers.is_baremetal():
if self.helpers.is_virtual_os() or self.helpers.is_baremetal():
metric_parser.add_argument('-b', '--fb-usage', action='store_true', required=False, help=fb_usage_help)
# Optional Args for Hypervisors and Baremetal systems
if self.amdsmi_helpers.is_hypervisor() or self.amdsmi_helpers.is_baremetal():
if self.helpers.is_hypervisor() or self.helpers.is_baremetal():
metric_parser.add_argument('-p', '--power', action='store_true', required=False, help=power_help)
metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help)
metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
@@ -458,9 +450,8 @@ class AMDSMIParser(argparse.ArgumentParser):
metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help)
# Optional Args for Linux Baremetal Systems
if self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux():
if self.helpers.is_baremetal() and self.helpers.is_linux():
metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help)
metric_parser.add_argument('-s', '--pcie-usage', action='store_true', required=False, help=pcie_usage_help)
metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help)
metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help)
metric_parser.add_argument('-M', '--mem-overdrive', action='store_true', required=False, help=mo_help)
@@ -471,14 +462,14 @@ class AMDSMIParser(argparse.ArgumentParser):
metric_parser.add_argument('-m', '--mem-usage', action='store_true', required=False, help=mem_usage_help)
# Options to only display to Hypervisors
if self.amdsmi_helpers.is_hypervisor():
if self.helpers.is_hypervisor():
metric_parser.add_argument('-s', '--schedule', action='store_true', required=False, help=schedule_help)
metric_parser.add_argument('-G', '--guard', action='store_true', required=False, help=guard_help)
metric_parser.add_argument('-u', '--guest', action='store_true', required=False, help=guest_help)
def _add_process_parser(self, subparsers, func):
if self.amdsmi_helpers.is_hypervisor():
if self.helpers.is_hypervisor():
# Don't add this subparser on Hypervisors
# This subparser is only available to Guest and Baremetal systems
return
@@ -495,9 +486,7 @@ class AMDSMIParser(argparse.ArgumentParser):
pid_help = "Gets all process information about the specified process based on Process ID"
name_help = "Gets all process information about the specified process based on Process Name.\
\nIf multiple processes have the same name information is returned for all of them."
watch_help = "Reprint the command in a loop of Interval seconds"
watch_time_help = "The total time to watch the given command"
iterations_help = "Total number of iterations to loop on the given command"
# Create process subparser
process_parser = subparsers.add_parser('process', help=process_help, description=process_subcommand_help)
@@ -509,21 +498,18 @@ class AMDSMIParser(argparse.ArgumentParser):
# Add Device args
self._add_device_arguments(process_parser, required=False)
# Add Watch args
self._add_watch_arguments(process_parser)
# Optional Args
process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help)
process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help)
process_parser.add_argument('-p', '--pid', action='store', type=self._positive_int, required=False, help=pid_help)
process_parser.add_argument('-n', '--name', action='store', required=False, help=name_help)
process_parser.add_argument('-w', '--watch', action='store', metavar='Interval',
type=self._positive_int, required=False, help=watch_help)
process_parser.add_argument('-W', '--watch_time', action=self._check_watch_selected(), metavar='Duration',
type=self._positive_int, required=False, help=watch_time_help)
process_parser.add_argument('-i', '--iterations', action=self._check_watch_selected(), metavar='Iterations',
type=self._positive_int, required=False, help=iterations_help)
def _add_profile_parser(self, subparsers, func):
if not (self.amdsmi_helpers.is_windows() and self.amdsmi_helpers.is_hypervisor()):
if not (self.helpers.is_windows() and self.helpers.is_hypervisor()):
# This subparser only applies to Hypervisors
return
@@ -544,7 +530,7 @@ class AMDSMIParser(argparse.ArgumentParser):
def _add_event_parser(self, subparsers, func):
if self.amdsmi_helpers.is_linux() and not self.amdsmi_helpers.is_virtual_os():
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
# This subparser only applies to Linux BareMetal & Linux Hypervisors, NOT Linux Guest
return
@@ -566,7 +552,7 @@ class AMDSMIParser(argparse.ArgumentParser):
def _add_topology_parser(self, subparsers, func):
return
if not(self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux()):
if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
# This subparser is only applicable to Baremetal Linux
return
@@ -599,11 +585,11 @@ class AMDSMIParser(argparse.ArgumentParser):
topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help)
topology_parser.add_argument('-t', '--type', action='store_true', required=False, help=type_help)
topology_parser.add_argument('-n', '--numa', action='store_true', required=False, help=numa_help)
topology_parser.add_argument('-b', '--numa_bw', action='store_true', required=False, help=numa_bw_help)
topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help)
def _add_set_value_parser(self, subparsers, func):
if not(self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux()):
if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
# This subparser is only applicable to Baremetal Linux
return
@@ -628,7 +614,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_mem_overdrive_help = "Set memory overclock overdrive level ***DEPRECATED IN NEWER KERNEL VERSIONS (use --mlevel instead)***"
set_power_overdrive_help = "Set the maximum GPU power using power overdrive in Watts"
set_profile_help = "Set power profile level (#) or a quoted string of custom profile attributes"
set_perf_det_help = "Set GPU clock frequency limit to get minimal performance variation"
set_perf_det_help = "Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation"
# Create set_value subparser
set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help)
@@ -641,7 +627,7 @@ class AMDSMIParser(argparse.ArgumentParser):
self._add_device_arguments(set_value_parser, required=True)
# Optional Args
set_value_parser.add_argument('-c', '--clock', action=self._validate_set_clock(True), nargs='+', type=self._positive_int, required=False, help=set_clock_help, metavar=('CLK_TYPE', 'CLK_LEVELS'))
set_value_parser.add_argument('-c', '--clock', action=self._validate_set_clock(True), nargs='+', required=False, help=set_clock_help, metavar=('CLK_TYPE', 'CLK_LEVELS'))
set_value_parser.add_argument('-s', '--sclk', action=self._validate_set_clock(False), nargs='+', type=self._positive_int, required=False, help=set_sclk_help, metavar='CLK_LEVELS')
set_value_parser.add_argument('-m', '--mclk', action=self._validate_set_clock(False), nargs='+', type=self._positive_int, required=False, help=set_mclk_help, metavar='CLK_LEVELS')
set_value_parser.add_argument('-p', '--pcie', action=self._validate_set_clock(False), nargs='+', type=self._positive_int, required=False, help=set_pcie_help, metavar='CLK_LEVELS')
@@ -651,7 +637,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_value_parser.add_argument('-r', '--srange', action=self._prompt_spec_warning(), nargs=2, type=self._positive_int, required=False, help=set_srange_help, metavar=('SCLKMIN', 'SCLKMAX'))
set_value_parser.add_argument('-R', '--mrange', action=self._prompt_spec_warning(), nargs=2, type=self._positive_int, required=False, help=set_mrange_help, metavar=('MCLKMIN', 'MCLKMAX'))
set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%')
set_value_parser.add_argument('-l', '--perflevel', action='store', choices=['auto', 'low', 'high', 'manual'], required=False, help=set_perf_level_help, metavar='LEVEL')
set_value_parser.add_argument('-l', '--perflevel', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL')
set_value_parser.add_argument('-o', '--overdrive', action=self._validate_overdrive_percent(), required=False, help=set_overdrive_help, metavar='%')
set_value_parser.add_argument('-O', '--memoverdrive', action=self._validate_overdrive_percent(), required=False, help=set_mem_overdrive_help, metavar='%')
set_value_parser.add_argument('-w', '--poweroverdrive', action=self._prompt_spec_warning(), type=self._positive_int, required=False, help=set_power_overdrive_help, metavar="WATTS")
@@ -661,13 +647,14 @@ class AMDSMIParser(argparse.ArgumentParser):
def _validate_set_clock(self, validate_clock_type=True):
""" Validate Clock input"""
amdsmi_helpers = self.amdsmi_helpers
amdsmi_helpers = self.helpers
class _ValidateClockType(argparse.Action):
# Checks the values
# Checks the clock type and clock values
def __call__(self, parser, args, values, option_string=None):
if validate_clock_type:
clock_type = values[0]
valid_clock_type, clock_types = amdsmi_helpers.is_valid_clock_type(clock_type=clock_type)
clock_types = amdsmi_helpers.get_clock_types()[0]
valid_clock_type, amdsmi_clock_type = amdsmi_helpers.validate_clock_type(input_clock_type=clock_type)
if not valid_clock_type:
raise argparse.ArgumentError(self, f"Invalid argument: '{clock_type}' needs to be a valid clock type:{clock_types}")
@@ -682,7 +669,7 @@ class AMDSMIParser(argparse.ArgumentParser):
freq_bitmask |= (1 << level)
if validate_clock_type:
setattr(args, self.dest, (clock_type, freq_bitmask))
setattr(args, self.dest, (amdsmi_clock_type, freq_bitmask))
else:
setattr(args, self.dest, freq_bitmask)
return _ValidateClockType
@@ -690,7 +677,7 @@ class AMDSMIParser(argparse.ArgumentParser):
def _prompt_spec_warning(self):
""" Prompt out of spec warning"""
amdsmi_helpers = self.amdsmi_helpers
amdsmi_helpers = self.helpers
class _PromptSpecWarning(argparse.Action):
# Checks the values
def __call__(self, parser, args, values, option_string=None):
@@ -701,57 +688,58 @@ class AMDSMIParser(argparse.ArgumentParser):
def _validate_fan_speed(self):
    """ Build the argparse action that validates a fan speed argument.

    The returned action accepts either a raw fan level ("0" to "255") or a
    percentage ("0%" to "100%"). Percentages are scaled onto the 0-255 fan
    level range. The resulting int is stored on the namespace under the
    action's dest.

    Returns:
        An argparse.Action subclass to pass as ``action=`` to add_argument.

    Raises (from the returned action):
        argparse.ArgumentError: the value is not a string, is not a whole
            number, or falls outside the accepted range.
    """
    amdsmi_helpers = self.helpers

    class _ValidateFanSpeed(argparse.Action):
        # Checks the values
        def __call__(self, parser, args, values, option_string=None):
            if not isinstance(values, str):
                raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-255 or 0-100%")

            if '%' in values:
                # Convert percentage to fan level
                percent_error = f"Invalid argument: '{values}' needs to be 0-100%"
                try:
                    percent = int(values[:-1])
                except ValueError:
                    raise argparse.ArgumentError(self, percent_error) from None
                # Reject percentages that would scale past the 0-255 fan level range
                if not 0 <= percent <= 100:
                    raise argparse.ArgumentError(self, percent_error)
                # Integer arithmetic keeps the scaling exact (no float rounding)
                fan_level = percent * 255 // 100
            else:
                # Store the fan level as fan_speed
                level_error = f"Invalid argument: '{values}' needs to be 0-255"
                try:
                    fan_level = int(values)
                except ValueError:
                    raise argparse.ArgumentError(self, level_error) from None
                if not 0 <= fan_level <= 255:
                    raise argparse.ArgumentError(self, level_error)

            # Only ask for the out-of-spec confirmation once the input is known to be valid
            amdsmi_helpers.confirm_out_of_spec_warning()
            setattr(args, self.dest, fan_level)
    return _ValidateFanSpeed
def _validate_overdrive_percent(self):
    """ Build the argparse action that validates an overdrive percentage.

    The returned action accepts a whole number from 0 to 20, optionally
    followed by a trailing '%'. The parsed int is stored on the namespace
    under the action's dest.

    Returns:
        An argparse.Action subclass to pass as ``action=`` to add_argument.

    Raises (from the returned action):
        argparse.ArgumentError: the value is not a string, is not a whole
            number, or is outside 0-20.
    """
    amdsmi_helpers = self.helpers

    class _ValidateOverdrivePercent(argparse.Action):
        # Checks the values
        def __call__(self, parser, args, values, option_string=None):
            format_error = f"Invalid argument: '{values}' needs to be 0-20 or 0-20%"
            if not isinstance(values, str):
                raise argparse.ArgumentError(self, format_error)

            # endswith() is safe on an empty string, unlike indexing values[-1]
            digits = values[:-1] if values.endswith('%') else values
            # Keep the try narrow so only the int conversion is reported as a bad argument
            try:
                over_drive_percent = int(digits)
            except ValueError:
                raise argparse.ArgumentError(self, format_error) from None

            if not 0 <= over_drive_percent <= 20:
                raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be within range 0-20 or 0-20%")

            amdsmi_helpers.confirm_out_of_spec_warning()
            setattr(args, self.dest, over_drive_percent)
    return _ValidateOverdrivePercent
def _add_reset_parser(self, subparsers, func):
if not(self.amdsmi_helpers.is_baremetal() and self.amdsmi_helpers.is_linux()):
if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
# This subparser is only applicable to Baremetal Linux
return
@@ -823,6 +811,7 @@ class AMDSMIParser(argparse.ArgumentParser):
rocm_smi_parser.add_argument('-l', '--load', action=self._check_input_file_path(), type=str, required=False, help=load_help)
rocm_smi_parser.add_argument('-s', '--save', action=self._check_output_file_path(), type=str, required=False, help=save_help)
rocm_smi_parser.add_argument('-b', '--showbw', action='store_true', required=False, help=showbw_help)
rocm_smi_parser.add_argument('-t', '--showtempgraph', action='store_true', required=False, help=showtempgraph_help)
rocm_smi_parser.add_argument('-m', '--showmclkrange', action='store_true', required=False, help=showmclkrange_help)
rocm_smi_parser.add_argument('-c', '--showsclkrange', action='store_true', required=False, help=showsclkrange_help)
@@ -832,13 +821,10 @@ class AMDSMIParser(argparse.ArgumentParser):
rocm_smi_parser.add_argument('-v', '--showclkvolt', action='store_true', required=False, help=showclkvolt_help)
rocm_smi_parser.add_argument('-f', '--showclkfrq', action='store_true', required=False, help=showclkfrq_help)
def error(self, message):
args = sys.argv[1:]
outputformat = "human"
if "--json" in args or "--j" in args:
outputformat = "json"
elif "--csv" in args or "-c" in args:
outputformat = "csv"
outputformat = self.helpers.get_output_format()
if "argument : invalid choice: " in message:
l = len("argument : invalid choice: ") + 1
message = message[l:]
+1 -1
Просмотреть файл
@@ -1 +1 @@
__version__ = "0.0.2"
__version__ = "0.0.3"
+1 -1
Просмотреть файл
@@ -2437,7 +2437,7 @@ def amdsmi_dev_get_ecc_count(
ec = amdsmi_wrapper.amdsmi_error_count_t()
_check_res(
amdsmi_wrapper. amdsmi_dev_get_ecc_count(
amdsmi_wrapper.amdsmi_dev_get_ecc_count(
device_handle, block, ctypes.byref(ec))
)
+6 -7
Просмотреть файл
@@ -168,17 +168,17 @@ def char_pointer_cast(string, encoding='utf-8'):
_libraries = {}
from pathlib import Path
libamd_smi_optrocm = Path(__file__).parents[3] / "/lib/libamd_smi.so"
libamd_smi_cpack = Path("@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/libamd_smi.so")
libamd_smi_optrocm = Path("/opt/rocm/lib/libamd_smi.so")
libamd_smi_parent_dir = Path(__file__).resolve().parent / "libamd_smi.so"
libamd_smi_cwd = Path.cwd()
libamd_smi_cwd = Path.cwd() / "libamd_smi.so"
if libamd_smi_cpack.is_file():
# try to find library in install directory provided by CMake
_libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_cpack)
elif libamd_smi_optrocm.is_file():
if libamd_smi_optrocm.is_file():
# try /opt/rocm/lib as a fallback
_libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_optrocm)
elif libamd_smi_cpack.is_file():
# try to find library in install directory provided by CMake
_libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_cpack)
elif libamd_smi_parent_dir.is_file():
# try to fall back to parent directory
_libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_parent_dir)
@@ -187,7 +187,6 @@ else:
_libraries['libamd_smi.so'] = ctypes.CDLL(libamd_smi_cwd)
# values for enumeration 'c__EA_amdsmi_init_flags_t'
c__EA_amdsmi_init_flags_t__enumvalues = {
0: 'AMDSMI_INIT_ALL_DEVICES',
+1 -1
Просмотреть файл
@@ -10,7 +10,7 @@ name = "amdsmi"
authors = [
{name = "AMD", email = "amd-smi.support@amd.com"},
]
version = '0.1'
version = '0.3'
license = {file = "amdsmi/LICENSE"}
readme = {file = "amdsmi/README.md", content-type = "text/markdown"}
description = "SMI LIB - AMD GPU Monitoring Library"
+6 -6
Просмотреть файл
@@ -106,17 +106,17 @@ def main():
library_path = os.path.join(os.path.dirname(__file__), library)
line_to_replace = "_libraries['{}'] = ctypes.CDLL('{}')".format(library_name, library_path)
new_line = f"""from pathlib import Path
libamd_smi_optrocm = Path(__file__).parents[3] / "/lib/{library_name}"
libamd_smi_cpack = Path("@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/{library_name}")
libamd_smi_optrocm = Path("/opt/rocm/lib/{library_name}")
libamd_smi_parent_dir = Path(__file__).resolve().parent / "{library_name}"
libamd_smi_cwd = Path.cwd()
libamd_smi_cwd = Path.cwd() / "{library_name}"
if libamd_smi_cpack.is_file():
# try to find library in install directory provided by CMake
_libraries['{library_name}'] = ctypes.CDLL(libamd_smi_cpack)
elif libamd_smi_optrocm.is_file():
if libamd_smi_optrocm.is_file():
# try /opt/rocm/lib as a fallback
_libraries['{library_name}'] = ctypes.CDLL(libamd_smi_optrocm)
elif libamd_smi_cpack.is_file():
# try to find library in install directory provided by CMake
_libraries['{library_name}'] = ctypes.CDLL(libamd_smi_cpack)
elif libamd_smi_parent_dir.is_file():
# try to fall back to parent directory
_libraries['{library_name}'] = ctypes.CDLL(libamd_smi_parent_dir)