Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I3f98829e988468d657f280db6765f2f5e28ff5f1


[ROCm/amdsmi commit: d5ad387252]
Этот коммит содержится в:
Maisam Arif
2023-07-31 09:31:40 -05:00
коммит произвёл Maisam Arif
родитель 907fcc53a1
Коммит 9f4faaabd8
4 изменённых файлов: 29 добавлений и 390 удалений
+4 -23
Просмотреть файл
@@ -185,8 +185,8 @@ amd-smi metric --help
usage: amd-smi metric [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-g GPU [GPU ...]]
[-w loop_time] [-W total_loop_time] [-i number_of_iterations] [-u]
[-b] [-p] [-c] [-t] [-e] [-P] [-V] [-f] [-C] [-o] [-M] [-l] [-r]
[-x] [-E] [-m]
[-b] [-p] [-c] [-t] [-e] [-P] [-f] [-C] [-o] [-l] [-r] [-x]
[-E] [-m]
If no GPU is specified, returns metric information for all GPUs on the system.
If no metric argument is provided all metric information will be displayed.
@@ -205,11 +205,9 @@ Metric arguments:
-t, --temperature Current temperatures
-e, --ecc Number of ECC errors
-P, --pcie Current PCIe speed and width
-V, --voltage Current GPU voltages
-f, --fan Current fan speed
-C, --voltage-curve Display voltage curve
-o, --overdrive Current GPU clock overdrive level
-M, --mem-overdrive Current memory clock overdrive level
-l, --perf-level Current DPM performance level
-r, --replay-count PCIe replay count
-x, --xgmi-err XGMI error information since last read
@@ -283,11 +281,7 @@ Command Modifiers:
amd-smi set --help
usage: amd-smi set [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] -g GPU [GPU ...]
[-c CLK_TYPE [CLK_LEVELS ...]] [-s CLK_LEVELS [CLK_LEVELS ...]]
[-m CLK_LEVELS [CLK_LEVELS ...]] [-p CLK_LEVELS [CLK_LEVELS ...]]
[-S SCLKLEVEL SCLK] [-M MCLKLEVEL MCLK] [-V POINT SCLK SVOLT]
[-r SCLKMIN SCLKMAX] [-R MCLKMIN MCLKMAX] [-f %] [-l LEVEL] [-o %]
[-O %] [-w WATTS] [-P SETPROFILE] [-d SCLKMAX]
[-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX]
A GPU must be specified to set a configuration.
A set argument must be provided; Multiple set arguments are accepted
@@ -296,20 +290,8 @@ Set Arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
-c CLK_TYPE [CLK_LEVELS ...], --clock CLK_TYPE [CLK_LEVELS ...] Sets clock frequency levels for specified clocks
-s CLK_LEVELS [CLK_LEVELS ...], --sclk CLK_LEVELS [CLK_LEVELS ...] Sets GPU clock frequency levels
-m CLK_LEVELS [CLK_LEVELS ...], --mclk CLK_LEVELS [CLK_LEVELS ...] Sets memory clock frequency levels
-p CLK_LEVELS [CLK_LEVELS ...], --pcie CLK_LEVELS [CLK_LEVELS ...] Sets PCIe Bandwidth
-S SCLKLEVEL SCLK, --slevel SCLKLEVEL SCLK Change GPU clock frequency and voltage for a specific level
-M MCLKLEVEL MCLK, --mlevel MCLKLEVEL MCLK Change GPU memory frequency and voltage for a specific level
-V POINT SCLK SVOLT, --vc POINT SCLK SVOLT Change SCLK voltage curve for a specified point
-r SCLKMIN SCLKMAX, --srange SCLKMIN SCLKMAX Sets min and max SCLK speed
-R MCLKMIN MCLKMAX, --mrange MCLKMIN MCLKMAX Sets min and max MCLK speed
-f %, --fan % Sets GPU fan speed (0-255 or 0-100%)
-l LEVEL, --perflevel LEVEL Sets performance level
-o %, --overdrive % Set GPU overdrive (0-20%) ***DEPRECATED IN NEWER KERNEL VERSIONS (use --slevel instead)***
-O %, --memoverdrive % Set memory overclock overdrive level ***DEPRECATED IN NEWER KERNEL VERSIONS (use --mlevel instead)***
-w WATTS, --poweroverdrive WATTS Set the maximum GPU power using power overdrive in Watts
-P SETPROFILE, --profile SETPROFILE Set power profile level (#) or a quoted string of custom profile attributes
-d SCLKMAX, --perfdeterminism SCLKMAX Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation
@@ -324,7 +306,7 @@ Command Modifiers:
amd-smi reset --help
usage: amd-smi reset [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] -g GPU [GPU ...]
[-G] [-c] [-f] [-p] [-o] [-x] [-d]
[-G] [-c] [-f] [-p] [-x] [-d]
A GPU must be specified to reset a configuration.
A reset argument must be provided; Multiple reset arguments are accepted
@@ -337,7 +319,6 @@ Reset Arguments:
-c, --clocks Reset clocks and overdrive to default
-f, --fans Reset fans to automatic (driver) control
-p, --profile Reset power profile back to default
-o, --poweroverdrive Set the maximum GPU power back to the device default state
-x, --xgmierr Reset XGMI error counts
-d, --perfdeterminism Disable performance determinism
+14 -333
Просмотреть файл
@@ -302,7 +302,7 @@ class AMDSMICommands():
unit = 'W'
power_limit = f"{power_limit} {unit}"
unit = 'C'
unit = '\N{DEGREE SIGN}C'
temp_edge_limit = f"{temp_edge_limit} {unit}"
temp_junction_limit = f"{temp_junction_limit} {unit}"
temp_vram_limit = f"{temp_vram_limit} {unit}"
@@ -594,8 +594,8 @@ class AMDSMICommands():
def metric(self, args, multiple_devices=False, watching_output=False, gpu=None,
usage=None, watch=None, watch_time=None, iterations=None, fb_usage=None, power=None,
clock=None, temperature=None, ecc=None, ecc_block=None, pcie=None, voltage=None,
fan=None, voltage_curve=None, overdrive=None, mem_overdrive=None, perf_level=None,
clock=None, temperature=None, ecc=None, ecc_block=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
replay_count=None, xgmi_err=None, energy=None, mem_usage=None):
"""Get Metric information for target gpu
@@ -615,11 +615,9 @@ class AMDSMICommands():
ecc (bool, optional): Value override for args.ecc. Defaults to None.
ecc_block (bool, optional): Value override for args.ecc_block. Defaults to None.
pcie (bool, optional): Value override for args.pcie. Defaults to None.
voltage (bool, optional): Value override for args.voltage. Defaults to None.
fan (bool, optional): Value override for args.fan. Defaults to None.
voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None.
overdrive (bool, optional): Value override for args.overdrive. Defaults to None.
mem_overdrive (bool, optional): Value override for args.mem_overdrive. Defaults to None.
perf_level (bool, optional): Value override for args.perf_level. Defaults to None.
replay_count (bool, optional): Value override for args.replay_count. Defaults to None.
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
@@ -663,16 +661,12 @@ class AMDSMICommands():
args.ecc_block = ecc_block
if pcie:
args.pcie = pcie
if voltage:
args.voltage = voltage
if fan:
args.fan = fan
if voltage_curve:
args.voltage_curve = voltage_curve
if overdrive:
args.overdrive = overdrive
if mem_overdrive:
args.mem_overdrive = mem_overdrive
if perf_level:
args.perf_level = perf_level
if xgmi_err:
@@ -726,12 +720,14 @@ class AMDSMICommands():
args.fb_usage = args.replay_count = args.mem_usage = self.all_arguments = True
if self.helpers.is_linux() and self.helpers.is_baremetal():
if not any([args.usage, args.fb_usage, args.power, args.clock, args.temperature, args.ecc, args.ecc_block, args.pcie, args.voltage, args.fan,
args.voltage_curve, args.overdrive, args.mem_overdrive, args.perf_level,
args.replay_count, args.xgmi_err, args.energy, args.mem_usage]):
args.usage = args.fb_usage = args.power = args.clock = args.temperature = args.ecc = args.ecc_block = args.pcie = args.voltage = args.fan = \
args.voltage_curve = args.overdrive = args.mem_overdrive = args.perf_level = \
args.replay_count = args.xgmi_err = args.energy = args.mem_usage = self.all_arguments = True
if not any([args.usage, args.fb_usage, args.power, args.clock, args.temperature,
args.ecc, args.ecc_block, args.pcie, args.fan, args.voltage_curve,
args.overdrive, args.perf_level, args.replay_count, args.xgmi_err,
args.energy, args.mem_usage]):
args.usage = args.fb_usage = args.power = args.clock = args.temperature = \
args.ecc = args.ecc_block = args.pcie = args.fan = args.voltage_curve = \
args.overdrive = args.perf_level = args.replay_count = args.xgmi_err = \
args.energy = args.mem_usage = self.all_arguments = True
# Add timestamp and store values for specified arguments
values_dict = {}
@@ -905,7 +901,6 @@ class AMDSMICommands():
values_dict['ecc_block'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.pcie:
try:
pcie_link_status = amdsmi_interface.amdsmi_get_pcie_link_caps(args.gpu)
@@ -922,21 +917,6 @@ class AMDSMICommands():
values_dict['pcie'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.voltage:
try:
volt_metric = amdsmi_interface.amdsmi_get_gpu_volt_metric(
args.gpu, amdsmi_interface.AmdSmiVoltageType.VDDGFX, amdsmi_interface.AmdSmiVoltageMetric.CURRENT)
if self.logger.is_human_readable_format():
unit = 'mV'
volt_metric = f"{volt_metric} {unit}"
values_dict['voltage'] = volt_metric
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['voltage'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.fan:
try:
fan_speed = amdsmi_interface.amdsmi_get_gpu_fan_speed(args.gpu, 0)
@@ -1000,8 +980,6 @@ class AMDSMICommands():
values_dict['overdrive'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.mem_overdrive:
values_dict['mem_overdrive'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.STATUS_NOT_YET_IMPLEMENTED).err_info
if args.perf_level:
try:
perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu)
@@ -1462,30 +1440,16 @@ class AMDSMICommands():
self.logger.print_output(multiple_device_enabled=True)
def set_value(self, args, multiple_devices=False, gpu=None, clock=None, sclk=None, mclk=None,
pcie=None, slevel=None, mlevel=None, vc=None, srange=None, mrange=None,
fan=None, perflevel=None, overdrive=None, memoverdrive=None,
poweroverdrive=None, profile=None, perfdeterminism=None):
def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perflevel=None,
profile=None, perfdeterminism=None):
"""Issue set commands to target gpu(s)
Args:
args (Namespace): Namespace containing the parsed CLI args
multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
gpu (device_handle, optional): device_handle for target device. Defaults to None.
clock ((amdsmi_interface.AmdSmiClkType, int), optional): Value override for args.clock. Defaults to None.
sclk (int, optional): Value override for args.sclk. Defaults to None.
mclk (int, optional): Value override for args.mclk. Defaults to None.
pcie (int, optional): Value override for args.pcie. Defaults to None.
slevel ((amdsmi_interface.AmdSmiFreqInd), int), optional): Value override for args.slevel. Defaults to None.
mlevel ((amdsmi_interface.AmdSmiFreqInd), optional): Value override for args.mlevel. Defaults to None.
vc ((int, int, int), optional): Value override for args.vc. Defaults to None.
srange ((int, int), optional): Value override for args.srange. Defaults to None.
mrange ((int, int), optional): Value override for args.mrange. Defaults to None.
fan (int, optional): Value override for args.fan. Defaults to None.
perflevel (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perflevel. Defaults to None.
overdrive (int, optional): Value override for args.overdrive. Defaults to None.
memoverdrive (int, optional): Value override for args.memoverdrive. Defaults to None.
poweroverdrive (int, optional): Value override for args.poweroverdrive. Defaults to None.
profile (bool, optional): Value override for args.profile. Defaults to None.
perfdeterminism (int, optional): Value override for args.perfdeterminism. Defaults to None.
@@ -1499,34 +1463,10 @@ class AMDSMICommands():
# Set args.* to passed in arguments
if gpu:
args.gpu = gpu
if clock:
args.clock = clock
if sclk:
args.sclk = sclk
if mclk:
args.mclk = mclk
if pcie:
args.pcie = pcie
if slevel:
args.slevel = slevel
if mlevel:
args.mlevel = mlevel
if vc:
args.vc = vc
if srange:
args.srange = srange
if mrange:
args.mrange = mrange
if fan:
args.fan = fan
if perflevel:
args.perflevel = perflevel
if overdrive:
args.overdrive = overdrive
if memoverdrive:
args.memoverdrive = memoverdrive
if poweroverdrive:
args.poweroverdrive = poweroverdrive
if profile:
args.profile = profile
if perfdeterminism:
@@ -1555,178 +1495,6 @@ class AMDSMICommands():
gpu_string = f"GPU ID: {gpu_id} BDF:{gpu_bdf}"
# Handle args
if args.clock:
clock_type, freq_bitmask = args.clock
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
if clock_type != amdsmi_interface.AmdSmiClkType.PCIE:
try:
amdsmi_interface.amdsmi_set_clk_freq(args.gpu, clock_type, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
else:
try:
amdsmi_interface.amdsmi_set_gpu_pci_bandwidth(args.gpu, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
self.logger.store_output(args.gpu, 'clock', f'Successfully set clock frequency bitmask for {clock_type}')
if isinstance(args.sclk, int):
freq_bitmask = args.sclk
clock_type = amdsmi_interface.AmdSmiClkType.SYS
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
try:
amdsmi_interface.amdsmi_set_clk_freq(args.gpu, clock_type, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
self.logger.store_output(args.gpu, 'sclk', 'Successfully set clock frequency bitmask')
if isinstance(args.mclk, int):
freq_bitmask = args.mclk
clock_type = amdsmi_interface.AmdSmiClkType.MEM
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
try:
amdsmi_interface.amdsmi_set_clk_freq(args.gpu, clock_type, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
self.logger.store_output(args.gpu, 'mclk', 'Successfully set clock frequency bitmask')
if isinstance(args.pcie, int):
freq_bitmask = args.pcie
clock_type = amdsmi_interface.AmdSmiClkType.PCIE
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
try:
amdsmi_interface.amdsmi_set_gpu_pci_bandwidth(args.gpu, freq_bitmask)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e
self.logger.store_output(args.gpu, 'pcie', 'Successfully set clock frequency bitmask')
if isinstance(args.slevel, int):
level, value = args.slevel
level = amdsmi_interface.AmdSmiFreqInd(level)
clock_type = amdsmi_interface.AmdSmiClkType.SYS
try:
amdsmi_interface.amdsmi_set_gpu_od_clk_info(args.gpu, level, value, clock_type)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {gpu_string}") from e
self.logger.store_output(args.gpu, 'slevel', 'Successfully changed clock frequency')
if isinstance(args.mlevel, int):
level, value = args.mlevel
level = amdsmi_interface.AmdSmiFreqInd(level)
clock_type = amdsmi_interface.AmdSmiClkType.MEM
try:
amdsmi_interface.amdsmi_set_gpu_od_clk_info(args.gpu, level, value, clock_type)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {gpu_string}") from e
self.logger.store_output(args.gpu, 'mlevel', 'Successfully changed clock frequency')
if isinstance(args.vc, int):
point, clk, volt = args.vc
try:
amdsmi_interface.amdsmi_set_gpu_od_volt_info(args.gpu, point, clk, volt)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the Voltage Curve point {point} to {clk}(MHz) {volt}(mV) on {gpu_string}") from e
self.logger.store_output(args.gpu, 'vc', f'Successfully set voltage point {point} to {clk}(MHz) {volt}(mV)')
if isinstance(args.srange, int):
min_value, max_value = args.srange
clock_type = amdsmi_interface.AmdSmiClkType.SYS
try:
amdsmi_interface.amdsmi_set_gpu_clk_range(args.gpu, min_value, max_value, clock_type)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {gpu_string}") from e
self.logger.store_output(args.gpu, 'srange', f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz)")
if isinstance(args.mrange, int):
min_value, max_value = args.srange
clock_type = amdsmi_interface.AmdSmiClkType.MEM
try:
amdsmi_interface.amdsmi_set_gpu_clk_range(args.gpu, min_value, max_value, clock_type)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {gpu_string}") from e
self.logger.store_output(args.gpu, 'mrange', f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz)")
if isinstance(args.fan, int):
try:
amdsmi_interface.amdsmi_set_gpu_fan_speed(args.gpu, 0, args.fan)
@@ -1746,89 +1514,6 @@ class AMDSMICommands():
raise ValueError(f"Unable to set performance level {args.perflevel} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perflevel}")
if isinstance(args.overdrive, int):
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
try:
amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, args.overdrive)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set overdrive {args.overdrive} to {gpu_string}") from e
self.logger.store_output(args.gpu, 'overdrive', f"Successfully to set overdrive level to {args.overdrive}")
if isinstance(args.memoverdrive, int):
# Check if the performance level is manual, if not then set it to manual
try:
perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to get performance level of {gpu_string}") from e
if 'manual' in perf_level.lower():
try:
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e
self.logger.store_output(args.gpu, 'memoverdrive', f"Successfully to set memoverdrive level to {args.memoverdrive}")
if isinstance(args.poweroverdrive, int):
overdrive_power_cap = args.poweroverdrive
try:
power_caps = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to get the power cap info for {gpu_string}") from e
if overdrive_power_cap == 0:
overdrive_power_cap = power_caps['default_power_cap']
else:
overdrive_power_cap *= 1000000
if overdrive_power_cap < power_caps['min_power_cap']:
raise ValueError(f"Requested power cap: {overdrive_power_cap} is lower than the min power cap: {power_caps['min_power_cap']}")
if overdrive_power_cap > power_caps['max_power_cap']:
raise ValueError(f"Requested power cap: {overdrive_power_cap} is greater than the max power cap: {power_caps['max_power_cap']}")
if overdrive_power_cap == power_caps['power_cap']:
raise ValueError(f"Requested power cap: {overdrive_power_cap} is the same as the current power cap: {power_caps['power_cap']}")
try:
amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, overdrive_power_cap)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set power cap to {overdrive_power_cap} on {gpu_string}") from e
try:
power_caps = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to get the power cap info for {gpu_string} post set") from e
if power_caps['power_cap'] == overdrive_power_cap:
self.logger.store_output(args.gpu, 'power_cap', f"Successfully set the power cap {overdrive_power_cap}")
else:
raise ValueError(f"Power cap: {overdrive_power_cap} set failed on {gpu_string}")
if args.profile:
self.logger.store_output(args.gpu, 'profile', "Not Yet Implemented")
if isinstance(args.perfdeterminism, int):
@@ -1849,8 +1534,7 @@ class AMDSMICommands():
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
clocks=None, fans=None, profile=None,
poweroverdrive=None, xgmierr=None, perfdeterminism=None):
clocks=None, fans=None, profile=None, xgmierr=None, perfdeterminism=None):
"""Issue reset commands to target gpu(s)
Args:
@@ -1861,7 +1545,6 @@ class AMDSMICommands():
clocks (bool, optional): Value override for args.clocks. Defaults to None.
fans (bool, optional): Value override for args.fans. Defaults to None.
profile (bool, optional): Value override for args.profile. Defaults to None.
poweroverdrive (bool, optional): Value override for args.poweroverdrive. Defaults to None.
xgmierr (bool, optional): Value override for args.xgmierr. Defaults to None.
perfdeterminism (bool, optional): Value override for args.perfdeterminism. Defaults to None.
@@ -1883,8 +1566,6 @@ class AMDSMICommands():
args.fans = fans
if profile:
args.profile = profile
if poweroverdrive:
args.poweroverdrive = poweroverdrive
if xgmierr:
args.xgmierr = xgmierr
if perfdeterminism:
-1
Просмотреть файл
@@ -25,7 +25,6 @@ import platform
import sys
import time
from pathlib import Path
from subprocess import run
from subprocess import PIPE, STDOUT
+11 -33
Просмотреть файл
@@ -127,6 +127,15 @@ class AMDSMIParser(argparse.ArgumentParser):
path.touch()
setattr(args, self.dest, path)
elif path.is_file():
file_name = str(path)
if args.json and str(path).split('.')[-1].lower() != 'json':
file_name += ".json"
elif args.csv and str(path).split('.')[-1].lower() != 'csv':
file_name += ".csv"
elif str(path).split('.')[-1].lower() != 'txt':
file_name += ".txt"
path = Path(file_name)
path.touch()
setattr(args, self.dest, path)
else:
raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat)
@@ -415,13 +424,11 @@ class AMDSMIParser(argparse.ArgumentParser):
ecc_help = "Number of ECC errors"
ecc_block_help = "Number of ECC errors per block"
pcie_help = "Current PCIe speed and width"
voltage_help = "Current GPU voltages"
# Help text for Arguments only on Linux Baremetal platforms
fan_help = "Current fan speed"
vc_help = "Display voltage curve"
overdrive_help = "Current GPU clock overdrive level"
mo_help = "Current memory clock overdrive level"
perf_level_help = "Current DPM performance level"
replay_count_help = "PCIe replay count"
xgmi_err_help = "XGMI error information since last read"
@@ -450,7 +457,6 @@ class AMDSMIParser(argparse.ArgumentParser):
if self.helpers.is_virtual_os() or self.helpers.is_baremetal():
metric_parser.add_argument('-b', '--fb-usage', action='store_true', required=False, help=fb_usage_help)
metric_parser.add_argument('-m', '--mem-usage', action='store_true', required=False, help=mem_usage_help)
metric_parser.add_argument('-r', '--replay-count', action='store_true', required=False, help=replay_count_help)
# Optional Args for Hypervisors and Baremetal systems
if self.helpers.is_hypervisor() or self.helpers.is_baremetal():
@@ -459,9 +465,8 @@ class AMDSMIParser(argparse.ArgumentParser):
metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
metric_parser.add_argument('-k', '--ecc-block', action='store_true', required=False, help=ecc_block_help)
metric_parser.add_argument('-r', '--replay-count', action='store_true', required=False, help=replay_count_help)
metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help)
metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help)
metric_parser.add_argument('-u', '--usage', action='store_true', required=False, help=usage_help)
# Optional Args for Linux Baremetal Systems
@@ -469,7 +474,6 @@ class AMDSMIParser(argparse.ArgumentParser):
metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help)
metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help)
metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help)
metric_parser.add_argument('-M', '--mem-overdrive', action='store_true', required=False, help=mo_help)
metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help)
metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help)
metric_parser.add_argument('-E', '--energy', action='store_true', required=False, help=energy_help)
@@ -544,7 +548,7 @@ class AMDSMIParser(argparse.ArgumentParser):
def _add_event_parser(self, subparsers, func):
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
# This subparser only applies to Linux BareMetal & Linux Hypervisors, NOT Linux Guest
# This subparser only applies to Linux Hypervisors, NOT Linux Guest
return
# Subparser help text
@@ -611,20 +615,8 @@ class AMDSMIParser(argparse.ArgumentParser):
set_value_optionals_title = "Set Arguments"
# Help text for Arguments only on Guest and BM platforms
set_clock_help = "Sets clock frequency levels for specified clocks"
set_sclk_help = "Sets GPU clock frequency levels"
set_mclk_help = "Sets memory clock frequency levels"
set_pcie_help = "Sets PCIe Bandwith"
set_slevel_help = "Change GPU clock frequency and voltage for a specific level"
set_mlevel_help = "Change GPU memory frequency and voltage for a specific level"
set_vc_help = "Change SCLK voltage curve for a specified point"
set_srange_help = "Sets min and max SCLK speed"
set_mrange_help = "Sets min and max MCLK speed"
set_fan_help = "Sets GPU fan speed (0-255 or 0-100%%)"
set_perf_level_help = "Sets performance level"
set_overdrive_help = "Set GPU overdrive (0-20%%) ***DEPRECATED IN NEWER KERNEL VERSIONS (use --slevel instead)***"
set_mem_overdrive_help = "Set memory overclock overdrive level ***DEPRECATED IN NEWER KERNEL VERSIONS (use --mlevel instead)***"
set_power_overdrive_help = "Set the maximum GPU power using power overdrive in Watts"
set_profile_help = "Set power profile level (#) or a quoted string of custom profile attributes"
set_perf_det_help = "Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation"
@@ -639,20 +631,8 @@ class AMDSMIParser(argparse.ArgumentParser):
self._add_device_arguments(set_value_parser, required=True)
# Optional Args
set_value_parser.add_argument('-c', '--clock', action=self._validate_set_clock(True), nargs='+', required=False, help=set_clock_help, metavar=('CLK_TYPE', 'CLK_LEVELS'))
set_value_parser.add_argument('-s', '--sclk', action=self._validate_set_clock(False), nargs='+', type=self._positive_int, required=False, help=set_sclk_help, metavar='CLK_LEVELS')
set_value_parser.add_argument('-m', '--mclk', action=self._validate_set_clock(False), nargs='+', type=self._positive_int, required=False, help=set_mclk_help, metavar='CLK_LEVELS')
set_value_parser.add_argument('-p', '--pcie', action=self._validate_set_clock(False), nargs='+', type=self._positive_int, required=False, help=set_pcie_help, metavar='CLK_LEVELS')
set_value_parser.add_argument('-S', '--slevel', action=self._prompt_spec_warning(), nargs=2, type=self._positive_int, required=False, help=set_slevel_help, metavar=('SCLKLEVEL', 'SCLK'))
set_value_parser.add_argument('-M', '--mlevel', action=self._prompt_spec_warning(), nargs=2, type=self._positive_int, required=False, help=set_mlevel_help, metavar=('MCLKLEVEL', 'MCLK'))
set_value_parser.add_argument('-V', '--vc', action=self._prompt_spec_warning(), nargs=3, type=self._positive_int, required=False, help=set_vc_help, metavar=('POINT', 'SCLK', 'SVOLT'))
set_value_parser.add_argument('-r', '--srange', action=self._prompt_spec_warning(), nargs=2, type=self._positive_int, required=False, help=set_srange_help, metavar=('SCLKMIN', 'SCLKMAX'))
set_value_parser.add_argument('-R', '--mrange', action=self._prompt_spec_warning(), nargs=2, type=self._positive_int, required=False, help=set_mrange_help, metavar=('MCLKMIN', 'MCLKMAX'))
set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%')
set_value_parser.add_argument('-l', '--perflevel', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL')
set_value_parser.add_argument('-o', '--overdrive', action=self._validate_overdrive_percent(), required=False, help=set_overdrive_help, metavar='%')
set_value_parser.add_argument('-O', '--memoverdrive', action=self._validate_overdrive_percent(), required=False, help=set_mem_overdrive_help, metavar='%')
set_value_parser.add_argument('-w', '--poweroverdrive', action=self._prompt_spec_warning(), type=self._positive_int, required=False, help=set_power_overdrive_help, metavar="WATTS")
set_value_parser.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE')
set_value_parser.add_argument('-d', '--perfdeterminism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLKMAX')
@@ -766,7 +746,6 @@ class AMDSMIParser(argparse.ArgumentParser):
resetclocks_help = "Reset clocks and overdrive to default"
resetfans_help = "Reset fans to automatic (driver) control"
resetprofile_help = "Reset power profile back to default"
resetpoweroverdrive_help = "Set the maximum GPU power back to the device default state"
resetxgmierr_help = "Reset XGMI error counts"
resetperfdet_help = "Disable performance determinism"
@@ -785,7 +764,6 @@ class AMDSMIParser(argparse.ArgumentParser):
reset_parser.add_argument('-c', '--clocks', action='store_true', required=False, help=resetclocks_help)
reset_parser.add_argument('-f', '--fans', action='store_true', required=False, help=resetfans_help)
reset_parser.add_argument('-p', '--profile', action='store_true', required=False, help=resetprofile_help)
reset_parser.add_argument('-o', '--poweroverdrive', action='store_true', required=False, help=resetpoweroverdrive_help)
reset_parser.add_argument('-x', '--xgmierr', action='store_true', required=False, help=resetxgmierr_help)
reset_parser.add_argument('-d', '--perfdeterminism', action='store_true', required=False, help=resetperfdet_help)