diff --git a/CHANGELOG.md b/CHANGELOG.md index 536bf7cfdc..cbe903cdf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -229,6 +229,14 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr - Increasing available JPEG engines to 40. Current ASICs may not support all 40. These will be indicated as UINT16_MAX or N/A in CLI. +- **Added support to get GPU voltage**. + ```shell + $ amd-smi metric --voltage + GPU: 0 + VOLTAGE: + VDDBOARD: 52536 mV + ... + ``` - **Added bad page threshold count**. - Added `amdsmi_get_gpu_bad_page_threshold` to Python API and CLI; root/sudo permissions required to display the count. diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 5ea59c0f8f..ffc91c34e1 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1383,7 +1383,7 @@ class AMDSMICommands(): usage=None, watch=None, watch_time=None, iterations=None, power=None, clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, fan=None, voltage_curve=None, overdrive=None, perf_level=None, - xgmi_err=None, energy=None, mem_usage=None, schedule=None, + xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None, guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None, ): """Get Metric information for target gpu @@ -1410,6 +1410,7 @@ class AMDSMICommands(): xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. energy (bool, optional): Value override for args.energy. Defaults to None. mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None. + voltage (bool, optional): Value override for args.voltage. Defaults to None. schedule (bool, optional): Value override for args.schedule. Defaults to None. guard (bool, optional): Value override for args.guard. Defaults to None. guest_data (bool, optional): Value override for args.guest_data. Defaults to None. 
@@ -1458,9 +1459,9 @@ class AMDSMICommands(): args.ecc = ecc if ecc_blocks: args.ecc_blocks = ecc_blocks - current_platform_args += ["usage", "power", "clock", "temperature", "pcie", "ecc", "ecc_blocks"] + current_platform_args += ["usage", "power", "clock", "temperature", "voltage", "pcie", "ecc", "ecc_blocks"] current_platform_values += [args.usage, args.power, args.clock, - args.temperature, args.pcie] + args.temperature, args.voltage, args.pcie] current_platform_values += [args.ecc, args.ecc_blocks] if self.helpers.is_baremetal() and self.helpers.is_linux(): @@ -2385,6 +2386,22 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: values_dict['xgmi_err'] = "N/A" logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info()) + if "voltage" in current_platform_args: + if args.voltage: + voltage_dict = {} + all_voltage = { + "vddboard": amdsmi_interface.AmdSmiVoltageType.VDDBOARD + } + for volt_type, volt_metric in all_voltage.items(): + try: + voltage = amdsmi_interface.amdsmi_get_gpu_volt_metric(args.gpu, volt_metric, amdsmi_interface.AmdSmiVoltageMetric.CURRENT) + if voltage == 0: + voltage = "N/A" + voltage_dict[volt_type] = self.helpers.unit_format(self.logger, voltage, "mV") + except amdsmi_exception.AmdSmiLibraryException as e: + voltage_dict[volt_type] = "N/A" + logging.debug("Failed to get voltage for gpu %s | %s", gpu_id, e.get_error_info()) + values_dict['voltage'] = voltage_dict if "energy" in current_platform_args: if args.energy: try: @@ -2991,7 +3008,7 @@ class AMDSMICommands(): usage=None, watch=None, watch_time=None, iterations=None, power=None, clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, fan=None, voltage_curve=None, overdrive=None, perf_level=None, - xgmi_err=None, energy=None, mem_usage=None, schedule=None, + xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None, guard=None, guest_data=None, fb_usage=None, xgmi=None, cpu=None, 
cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None, cpu_c0_res=None, cpu_lclk_dpm_level=None, cpu_pwr_svi_telemetry_rails=None, @@ -3025,6 +3042,7 @@ class AMDSMICommands(): xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. energy (bool, optional): Value override for args.energy. Defaults to None. mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None. + voltage (bool, optional): Value override for args.voltage. Defaults to None. schedule (bool, optional): Value override for args.schedule. Defaults to None. guard (bool, optional): Value override for args.guard. Defaults to None. guest_data (bool, optional): Value override for args.guest_data. Defaults to None. @@ -3073,7 +3091,7 @@ class AMDSMICommands(): gpu_args_enabled = False gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock", "temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve", - "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule", + "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "voltage", "schedule", "guard", "guest_data", "fb_usage", "xgmi", "throttle"] for attr in gpu_attributes: if hasattr(args, attr): @@ -3146,7 +3164,7 @@ class AMDSMICommands(): usage, watch, watch_time, iterations, power, clock, temperature, ecc, ecc_blocks, pcie, fan, voltage_curve, overdrive, perf_level, - xgmi_err, energy, mem_usage, schedule, + xgmi_err, energy, mem_usage, voltage, schedule, guard, guest_data, fb_usage, xgmi, throttle, ) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized @@ -3182,7 +3200,7 @@ class AMDSMICommands(): usage, watch, watch_time, iterations, power, clock, temperature, ecc, ecc_blocks, pcie, fan, voltage_curve, overdrive, perf_level, - xgmi_err, energy, mem_usage, schedule, throttle, + xgmi_err, energy, mem_usage, voltage, schedule, throttle, ) if self.logger.is_json_format(): self.logger.combine_arrays_to_json() diff --git 
a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index ba09232b8a..95ed98eda3 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -874,6 +874,7 @@ class AMDSMIParser(argparse.ArgumentParser): ecc_help = "Total number of ECC errors" ecc_blocks_help = "Number of ECC errors per block" pcie_help = "Current PCIe speed, width, and replay count" + voltage_help = "GPU voltage" # Help text for Arguments only on Linux Baremetal platforms fan_help = "Current fan speed" @@ -939,6 +940,7 @@ class AMDSMIParser(argparse.ArgumentParser): metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help) metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) + metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help) # Options that only apply to Hypervisors and Baremetal Linux if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()): diff --git a/docs/how-to/amdsmi-cli-tool.md b/docs/how-to/amdsmi-cli-tool.md index 9f5d7e519e..db1b9618c9 100644 --- a/docs/how-to/amdsmi-cli-tool.md +++ b/docs/how-to/amdsmi-cli-tool.md @@ -313,6 +313,7 @@ Metric arguments: -P, --pcie Current PCIe speed, width, and replay count -e, --ecc Total number of ECC errors -k, --ecc-blocks Number of ECC errors per block + -V, --voltage GPU voltage -f, --fan Current fan speed -C, --voltage-curve Display voltage curve -o, --overdrive Current GPU clock overdrive and GPU memory clock overdrive level diff --git a/docs/reference/amdsmi-py-api.md b/docs/reference/amdsmi-py-api.md index e4f17b477c..9f0b2bbbab 100644 --- a/docs/reference/amdsmi-py-api.md +++ b/docs/reference/amdsmi-py-api.md @@ -2112,16 +2112,17 @@ machine guest Input parameters: -* `processor_handle` handle for the given device -* `sensor_type` part of device from 
which voltage should be obtained -* `metric` enum indicated which voltage value should be retrieved +Parameters | Description +---|--- +`processor_handle` | Handle for the given device +`sensor_type` |
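<table><thead><tr><th>Possible Values</th><th>Description</th></tr></thead><tbody>
<tr><td>`AmdSmiVoltageType.VDDGFX`</td><td>Represents the voltage supplied to the GPU's graphics core.</td></tr>
<tr><td>`AmdSmiVoltageType.VDDBOARD`</td><td>Represents the voltage supplied to the entire GPU board, including auxiliary components. Intended for MI300+.</td></tr></tbody></table>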
+`metric` |
<table><thead><tr><th>Possible Values</th><th>Description</th></tr></thead><tbody>
<tr><td>`AmdSmiVoltageMetric.CURRENT`</td><td>Represents the current voltage value measured at the specified sensor.</td></tr>
<tr><td>`AmdSmiVoltageMetric.MAX`</td><td>Represents the maximum voltage value recorded at the specified sensor.</td></tr>
<tr><td>`AmdSmiVoltageMetric.MIN`</td><td>Represents the minimum voltage value recorded at the specified sensor.</td></tr>
<tr><td>`AmdSmiVoltageMetric.AVERAGE`</td><td>Represents the average voltage value calculated over a period of time at the specified sensor.</td></tr>
<tr><td>`AmdSmiVoltageMetric.MAX_CRIT`</td><td>Represents the critical maximum voltage value that should not be exceeded.</td></tr>
<tr><td>`AmdSmiVoltageMetric.MIN_CRIT`</td><td>Represents the critical minimum voltage value that should not be dropped below.</td></tr>
<tr><td>`AmdSmiVoltageMetric.LOWEST`</td><td>Represents the lowest voltage value recorded during the monitoring period.</td></tr>
<tr><td>`AmdSmiVoltageMetric.HIGHEST`</td><td>Represents the highest voltage value recorded during the monitoring period.</td></tr></tbody></table>
Output: Voltage as integer in millivolts Exceptions that can be thrown by `amdsmi_get_gpu_volt_metric` function: * `AmdSmiLibraryException` -* `AmdSmiRetryException` * `AmdSmiParameterException` Example: @@ -2133,8 +2134,11 @@ try: print("No GPUs on machine") else: for device in devices: - voltage = amdsmi_get_gpu_volt_metric(device, AmdSmiVoltageType.VDDGFX, - AmdSmiVoltageMetric.AVERAGE) + voltage = amdsmi_get_gpu_volt_metric( + device, + AmdSmiVoltageType.VDDBOARD, + AmdSmiVoltageMetric.AVERAGE + ) print(voltage) except AmdSmiException as e: print(e) @@ -2692,7 +2696,7 @@ except AmdSmiException as e: ### amdsmi_get_gpu_power_profile_presets -Description: Get the list of available preset power profiles and an indication of +Description: Get the list of available preset power profiles and an indication of which profile is currently active. It is not supported on virtual machine guest Input parameters: diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index e6fb728776..2ad4a30e2a 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -1360,7 +1360,8 @@ typedef enum { AMDSMI_VOLT_TYPE_FIRST = 0, AMDSMI_VOLT_TYPE_VDDGFX = AMDSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage - AMDSMI_VOLT_TYPE_LAST = AMDSMI_VOLT_TYPE_VDDGFX, + AMDSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD + AMDSMI_VOLT_TYPE_LAST = AMDSMI_VOLT_TYPE_VDDBOARD, AMDSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type } amdsmi_voltage_type_t; diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 32f038ac2d..fc942257b0 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -309,6 +309,7 @@ class AmdSmiVoltageMetric(IntEnum): class AmdSmiVoltageType(IntEnum): VDDGFX = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDGFX + VDDBOARD = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDBOARD INVALID = amdsmi_wrapper.AMDSMI_VOLT_TYPE_INVALID class AmdSmiAcceleratorPartitionResourceType(IntEnum): diff --git a/py-interface/amdsmi_wrapper.py 
b/py-interface/amdsmi_wrapper.py index 0132aa0522..c915ba8467 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -1530,12 +1530,14 @@ amdsmi_voltage_metric_t = ctypes.c_uint32 # enum amdsmi_voltage_type_t__enumvalues = { 0: 'AMDSMI_VOLT_TYPE_FIRST', 0: 'AMDSMI_VOLT_TYPE_VDDGFX', - 0: 'AMDSMI_VOLT_TYPE_LAST', + 1: 'AMDSMI_VOLT_TYPE_VDDBOARD', + 1: 'AMDSMI_VOLT_TYPE_LAST', 4294967295: 'AMDSMI_VOLT_TYPE_INVALID', } AMDSMI_VOLT_TYPE_FIRST = 0 AMDSMI_VOLT_TYPE_VDDGFX = 0 -AMDSMI_VOLT_TYPE_LAST = 0 +AMDSMI_VOLT_TYPE_VDDBOARD = 1 +AMDSMI_VOLT_TYPE_LAST = 1 AMDSMI_VOLT_TYPE_INVALID = 4294967295 amdsmi_voltage_type_t = ctypes.c_uint32 # enum @@ -3161,6 +3163,7 @@ __all__ = \ 'AMDSMI_VOLT_MAX_CRIT', 'AMDSMI_VOLT_MIN', 'AMDSMI_VOLT_MIN_CRIT', 'AMDSMI_VOLT_TYPE_FIRST', 'AMDSMI_VOLT_TYPE_INVALID', 'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDGFX', + 'AMDSMI_VOLT_TYPE_VDDBOARD', 'AMDSMI_VRAM_TYPE_DDR2', 'AMDSMI_VRAM_TYPE_DDR3', 'AMDSMI_VRAM_TYPE_DDR4', 'AMDSMI_VRAM_TYPE_GDDR1', 'AMDSMI_VRAM_TYPE_GDDR2', 'AMDSMI_VRAM_TYPE_GDDR3', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index e73a2d3ec0..2b4476e0c0 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -574,9 +574,10 @@ typedef enum { typedef enum { RSMI_VOLT_TYPE_FIRST = 0, - RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU - //!< voltage - RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDGFX, + RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage + RSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD + + RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDBOARD, RSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type } rsmi_voltage_type_t; diff --git a/rocm_smi/src/rocm_smi_monitor.cc b/rocm_smi/src/rocm_smi_monitor.cc index 85c204dd6c..cb6b36dd01 100644 --- a/rocm_smi/src/rocm_smi_monitor.cc +++ b/rocm_smi/src/rocm_smi_monitor.cc @@ -88,7 +88,7 @@ static const char *kTempSensorTypeJunctionName = 
"junction"; static const char *kTempSensorTypeEdgeName = "edge"; static const char *kTempSensorTypeVddgfxName = "vddgfx"; - +static const char *kTempSensorTypeVddboardName = "vddboard"; static const std::map kTempSensorNameMap = { @@ -100,6 +100,7 @@ static const std::map static const std::map kVoltSensorNameMap = { {kTempSensorTypeVddgfxName, RSMI_VOLT_TYPE_VDDGFX}, + {kTempSensorTypeVddboardName, RSMI_VOLT_TYPE_VDDBOARD}, }; static const std::map kMonitorNameMap = { @@ -378,7 +379,13 @@ Monitor::setVoltSensorLabelMap(void) { }; for (uint32_t i = 0; i < RSMI_VOLT_TYPE_LAST + 1; ++i) { - ret = add_volt_sensor_entry(i); + // VDDGFX -> 0, VDDNB -> 1, VDDBOARD -> 2 + // Here the VDDNB will be skipped as it is not defined in the enum and not supported by AMD. + auto file_index = i; + if (i >= RSMI_VOLT_TYPE_VDDBOARD) { + file_index = i + 1; + } + ret = add_volt_sensor_entry(file_index); if (ret) { return ret; } diff --git a/tests/amd_smi_test/test_common.cc b/tests/amd_smi_test/test_common.cc index f565d0e525..58a313cb9f 100644 --- a/tests/amd_smi_test/test_common.cc +++ b/tests/amd_smi_test/test_common.cc @@ -123,6 +123,7 @@ static const char* short_options = "i:v:m:fr"; static const std::map kVoltSensorNameMap = { {AMDSMI_VOLT_TYPE_VDDGFX, "Vddgfx"}, + {AMDSMI_VOLT_TYPE_VDDBOARD, "Vddboard"}, }; static void PrintHelp(void) {