Removed Throttle Status from CLI Tool

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Change-Id: I8eb8f30f821589003201d6d8bb96592ec5f8a483
This commit is contained in:
Maisam Arif
2024-06-06 16:17:49 -05:00
rodzic 936451e286
commit 37c044696d
7 zmienionych plików z 13 dodań i 30 usunięć
+3
Wyświetl plik
@@ -88,6 +88,9 @@ ASIC products. This requires users to update any ABIs using this structure.
### Fixes
- **Removed `throttle-status` from `amd-smi monitor` as it is no longer reliably supported**.
Throttle status may work for older ASICs, but will be replaced with PVIOL and TVIOL metrics for future ASIC support. It remains a field in the gpu_metrics API and in `amd-smi metric --power`.
- **`amdsmi_get_gpu_board_info()` no longer returns junk char strings**.
Previously, if retrieving the character strings partially failed, the API would return
garbage output to its callers. This fix populates as many values as possible.
+1 -2
Wyświetl plik
@@ -594,7 +594,7 @@ Command Modifiers:
usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
[-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]]
[-w INTERVAL] [-W TIME] [-i ITERATIONS] [-p] [-t] [-u] [-m] [-n]
[-d] [-s] [-e] [-v] [-r]
[-d] [-e] [-v] [-r]
Monitor a target device for the specified arguments.
If no arguments are provided, all arguments will be enabled.
@@ -626,7 +626,6 @@ Monitor Arguments:
-m, --mem Monitor memory utilization (%) and clock (MHz)
-n, --encoder Monitor encoder utilization (%) and clock (MHz)
-d, --decoder Monitor decoder utilization (%) and clock (MHz)
-s, --throttle-status Monitor thermal throttle status
-e, --ecc Monitor ECC single bit, ECC double bit, and PCIe replay error counts
-v, --vram-usage Monitor memory usage in MB
-r, --pcie Monitor PCIe bandwidth in Mb/s
+5 -22
Wyświetl plik
@@ -1328,8 +1328,8 @@ class AMDSMICommands():
'gfx_voltage': "N/A",
'soc_voltage': "N/A",
'mem_voltage': "N/A",
'power_management': "N/A",
'throttle_status': "N/A"}
'throttle_status': "N/A",
'power_management': "N/A"}
try:
voltage_unit = "mV"
@@ -3968,7 +3968,7 @@ class AMDSMICommands():
def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
watch=None, watch_time=None, iterations=None, power_usage=None,
temperature=None, gfx_util=None, mem_util=None, encoder=None, decoder=None,
throttle_status=None, ecc=None, vram_usage=None, pcie=None):
ecc=None, vram_usage=None, pcie=None):
""" Populate a table with each GPU as an index to rows of targeted data
Args:
@@ -3984,7 +3984,6 @@ class AMDSMICommands():
mem (bool, optional): Value override for args.mem. Defaults to None.
encoder (bool, optional): Value override for args.encoder. Defaults to None.
decoder (bool, optional): Value override for args.decoder. Defaults to None.
throttle_status (bool, optional): Value override for args.throttle_status. Defaults to None.
ecc (bool, optional): Value override for args.ecc. Defaults to None.
vram_usage (bool, optional): Value override for args.vram_usage. Defaults to None.
pcie (bool, optional): Value override for args.pcie. Defaults to None.
@@ -4019,8 +4018,6 @@ class AMDSMICommands():
args.encoder = encoder
if decoder:
args.decoder = decoder
if throttle_status:
args.throttle_status = throttle_status
if ecc:
args.ecc = ecc
if vram_usage:
@@ -4034,10 +4031,10 @@ class AMDSMICommands():
# If all arguments are False, then print all values
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
args.encoder, args.decoder, args.throttle_status, args.ecc,
args.encoder, args.decoder, args.ecc,
args.vram_usage, args.pcie]):
args.power_usage = args.temperature = args.gfx = args.mem = \
args.encoder = args.decoder = args.throttle_status = args.ecc = \
args.encoder = args.decoder = args.ecc = \
args.vram_usage = args.pcie = True
# Handle watch logic, will only enter this block once
@@ -4282,20 +4279,6 @@ class AMDSMICommands():
logging.debug("Failed to get decoder clock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'DEC_CLOCK'.rjust(11)
if args.throttle_status:
try:
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status']
if throttle_status != "N/A":
if throttle_status:
throttle_status = "THROTTLED"
else:
throttle_status = "UNTHROTTLED"
monitor_values['throttle_status'] = throttle_status
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['throttle_status'] = "N/A"
logging.debug("Failed to get throttle status on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'THROTTLE'.rjust(13)
if args.ecc:
try:
ecc = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu)
+2 -2
Wyświetl plik
@@ -104,8 +104,8 @@ class AMDSMIHelpers():
if string_format:
return f"{operating_system} {operating_system_type}"
else:
return (operating_system, operating_system_type)
return (operating_system, operating_system_type)
def is_virtual_os(self):
+1 -1
Wyświetl plik
@@ -116,7 +116,7 @@ class AMDSMILogger():
table_values += value.rjust(11)
elif key == 'vram_total' or 'ecc' in key:
table_values += value.rjust(12)
elif key in ('throttle_status', 'pcie_replay'):
elif key in ['pcie_replay']:
table_values += value.rjust(13)
# Only for handling topology tables
elif 'gpu_' in key:
-2
Wyświetl plik
@@ -1113,7 +1113,6 @@ class AMDSMIParser(argparse.ArgumentParser):
mem_util_help = "Monitor memory utilization (%%) and clock (MHz)"
encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)"
decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)"
throttle_help = "Monitor thermal throttle status"
ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts"
mem_usage_help = "Monitor memory usage in MB"
pcie_bandwidth_help = "Monitor PCIe bandwidth in Mb/s"
@@ -1136,7 +1135,6 @@ class AMDSMIParser(argparse.ArgumentParser):
monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help)
monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)
monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help)
monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help)
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help)
monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help)
+1 -1
Wyświetl plik
@@ -2257,7 +2257,7 @@ Output: Dictionary with fields
`current_dclk0` | Current dclk0 | MHz
`current_vclk1` | Current vclk1 | MHz
`current_dclk1` | Current dclk1 | MHz
`throttle_status` | Current throttle status | MHz
`throttle_status` | Current throttle status | bool
`current_fan_speed` | Current fan speed | RPM
`pcie_link_width` | PCIe link width (number of lanes) | lanes
`pcie_link_speed` | PCIe link speed in 0.1 GT/s (Giga Transfers per second) | GT/s