diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 0fca41aedf..287351d562 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -88,6 +88,9 @@ ASIC products. This requires users to update any ABIs using this structure. ### Fixes +- **Removed `throttle-status` from `amd-smi monitor` as it is no longer reliably supported**. +Throttle status may work for older ASICs, but will be replaced with PVIOL and TVIOL metrics for future ASIC support. It remains a field in the gpu_metrics API and in `amd-smi metric --power`. + - **`amdsmi_get_gpu_board_info()` no longer returns junk char strings**. Previously if there was a partial failure to retrieve character strings, we would return garbage output to users using the API. This fix intends to populate as many values as possible. diff --git a/projects/amdsmi/amdsmi_cli/README.md b/projects/amdsmi/amdsmi_cli/README.md index c4fc361ec0..d87ec235b6 100644 --- a/projects/amdsmi/amdsmi_cli/README.md +++ b/projects/amdsmi/amdsmi_cli/README.md @@ -594,7 +594,7 @@ Command Modifiers: usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-p] [-t] [-u] [-m] [-n] - [-d] [-s] [-e] [-v] [-r] + [-d] [-e] [-v] [-r] Monitor a target device for the specified arguments. If no arguments are provided, all arguments will be enabled. @@ -626,7 +626,6 @@ Monitor Arguments: -m, --mem Monitor memory utilization (%) and clock (MHz) -n, --encoder Monitor encoder utilization (%) and clock (MHz) -d, --decoder Monitor decoder utilization (%) and clock (MHz) - -s, --throttle-status Monitor thermal throttle status -e, --ecc Monitor ECC single bit, ECC double bit, and PCIe replay error counts -v, --vram-usage Monitor memory usage in MB -r, --pcie Monitor PCIe bandwidth in Mb/s diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 0729b6f0de..49b6b694ff 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -1328,8 +1328,8 @@ class AMDSMICommands(): 'gfx_voltage': "N/A", 'soc_voltage': "N/A", 'mem_voltage': "N/A", - 'power_management': "N/A", - 'throttle_status': "N/A"} + 'throttle_status': "N/A", + 'power_management': "N/A"} try: voltage_unit = "mV" @@ -3968,7 +3968,7 @@ class AMDSMICommands(): def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None, watch=None, watch_time=None, iterations=None, power_usage=None, temperature=None, gfx_util=None, mem_util=None, encoder=None, decoder=None, - throttle_status=None, ecc=None, vram_usage=None, pcie=None): + ecc=None, vram_usage=None, pcie=None): """ Populate a table with each GPU as an index to rows of targeted data Args: @@ -3984,7 +3984,6 @@ class AMDSMICommands(): mem (bool, optional): Value override for args.mem. Defaults to None. encoder (bool, optional): Value override for args.encoder. Defaults to None. decoder (bool, optional): Value override for args.decoder. Defaults to None. - throttle_status (bool, optional): Value override for args.throttle_status. Defaults to None. ecc (bool, optional): Value override for args.ecc. Defaults to None. vram_usage (bool, optional): Value override for args.vram_usage. Defaults to None. pcie (bool, optional): Value override for args.pcie. Defaults to None. @@ -4019,8 +4018,6 @@ class AMDSMICommands(): args.encoder = encoder if decoder: args.decoder = decoder - if throttle_status: - args.throttle_status = throttle_status if ecc: args.ecc = ecc if vram_usage: @@ -4034,10 +4031,10 @@ class AMDSMICommands(): # If all arguments are False, the print all values if not any([args.power_usage, args.temperature, args.gfx, args.mem, - args.encoder, args.decoder, args.throttle_status, args.ecc, + args.encoder, args.decoder, args.ecc, args.vram_usage, args.pcie]): args.power_usage = args.temperature = args.gfx = args.mem = \ - args.encoder = args.decoder = args.throttle_status = args.ecc = \ + args.encoder = args.decoder = args.ecc = \ args.vram_usage = args.pcie = True # Handle watch logic, will only enter this block once @@ -4282,20 +4279,6 @@ class AMDSMICommands(): logging.debug("Failed to get decoder clock on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.table_header += 'DEC_CLOCK'.rjust(11) - if args.throttle_status: - try: - throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status'] - if throttle_status != "N/A": - if throttle_status: - throttle_status = "THROTTLED" - else: - throttle_status = "UNTHROTTLED" - monitor_values['throttle_status'] = throttle_status - except amdsmi_exception.AmdSmiLibraryException as e: - monitor_values['throttle_status'] = "N/A" - logging.debug("Failed to get throttle status on gpu %s | %s", gpu_id, e.get_error_info()) - - self.logger.table_header += 'THROTTLE'.rjust(13) if args.ecc: try: ecc = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index f1bc229e21..7db9be1cb1 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -104,8 +104,8 @@ class AMDSMIHelpers(): if string_format: return f"{operating_system} {operating_system_type}" - else: - return (operating_system, operating_system_type) + + return (operating_system, operating_system_type) def is_virtual_os(self): diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 473235b44c..49747694f0 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -116,7 +116,7 @@ class AMDSMILogger(): table_values += value.rjust(11) elif key == 'vram_total' or 'ecc' in key: table_values += value.rjust(12) - elif key in ('throttle_status', 'pcie_replay'): + elif key in ['pcie_replay']: table_values += value.rjust(13) # Only for handling topology tables elif 'gpu_' in key: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index edc188b737..a5ccb6db76 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1113,7 +1113,6 @@ class AMDSMIParser(argparse.ArgumentParser): mem_util_help = "Monitor memory utilization (%%) and clock (MHz)" encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)" decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)" - throttle_help = "Monitor thermal throttle status" ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts" mem_usage_help = "Monitor memory usage in MB" pcie_bandwidth_help = "Monitor PCIe bandwidth in Mb/s" @@ -1136,7 +1135,6 @@ class AMDSMIParser(argparse.ArgumentParser): monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help) monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help) monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help) - monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help) monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help) monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help) diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index 1863a4abff..93f5245248 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -2257,7 +2257,7 @@ Output: Dictionary with fields `current_dclk0` | Current dclk0 | MHz `current_vclk1` | Current vclk1 | MHz `current_dclk1` | Current dclk1 | MHz -`throttle_status` | Current throttle status | MHz +`throttle_status` | Current throttle status | bool `current_fan_speed` | Current fan speed | RPM `pcie_link_width` | PCIe link width (number of lanes) | lanes `pcie_link_speed` | PCIe link speed in 0.1 GT/s (Giga Transfers per second) | GT/s