[SWDEV-520665] Add support for board voltage (#303)

* Add the API and CLI to show the board voltage. 

---------

Change-Id: Icb25bd653bb1d004704b5a21b378ca31b2b242c7
Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com>
Signed-off-by: AL Musaffar, Yazen <Yazen.ALMusaffar@amd.com>

[ROCm/amdsmi commit: 970560fc7c]
Этот коммит содержится в:
Liu, Shuzhou (Bill)
2025-05-29 19:55:08 -04:00
коммит произвёл GitHub
родитель fc54da7679
Коммит ff2e230a34
11 изменённых файлов: 69 добавлений и 22 удалений
+8
Просмотреть файл
@@ -229,6 +229,14 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
- Increasing available JPEG engines to 40.
Current ASICs may not support all 40. These will be indicated as UINT16_MAX or N/A in CLI.
- **Added support to get GPU board voltage**.
```shell
$ amd-smi metric --voltage
GPU: 0
VOLTAGE:
VDDBOARD: 52536 mV
...
```
- **Added bad page threshold count**.
- Added `amdsmi_get_gpu_bad_page_threshold` to Python API and CLI; root/sudo permissions required to display the count.
+25 -7
Просмотреть файл
@@ -1383,7 +1383,7 @@ class AMDSMICommands():
usage=None, watch=None, watch_time=None, iterations=None, power=None,
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
xgmi_err=None, energy=None, mem_usage=None, schedule=None,
xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None,
guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None,
):
"""Get Metric information for target gpu
@@ -1410,6 +1410,7 @@ class AMDSMICommands():
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
energy (bool, optional): Value override for args.energy. Defaults to None.
mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
voltage (bool, optional): Value override for args.voltage. Defaults to None.
schedule (bool, optional): Value override for args.schedule. Defaults to None.
guard (bool, optional): Value override for args.guard. Defaults to None.
guest_data (bool, optional): Value override for args.guest_data. Defaults to None.
@@ -1458,9 +1459,9 @@ class AMDSMICommands():
args.ecc = ecc
if ecc_blocks:
args.ecc_blocks = ecc_blocks
current_platform_args += ["usage", "power", "clock", "temperature", "pcie", "ecc", "ecc_blocks"]
current_platform_args += ["usage", "power", "clock", "temperature", "voltage", "pcie", "ecc", "ecc_blocks"]
current_platform_values += [args.usage, args.power, args.clock,
args.temperature, args.pcie]
args.temperature, args.voltage, args.pcie]
current_platform_values += [args.ecc, args.ecc_blocks]
if self.helpers.is_baremetal() and self.helpers.is_linux():
@@ -2385,6 +2386,22 @@ class AMDSMICommands():
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['xgmi_err'] = "N/A"
logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info())
if "voltage" in current_platform_args:
if args.voltage:
voltage_dict = {}
all_voltage = {
"vddboard": amdsmi_interface.AmdSmiVoltageType.VDDBOARD
}
for volt_type, volt_metric in all_voltage.items():
try:
voltage = amdsmi_interface.amdsmi_get_gpu_volt_metric(args.gpu, volt_metric, amdsmi_interface.AmdSmiVoltageMetric.CURRENT)
if voltage == 0:
voltage = "N/A"
voltage_dict[volt_type] = self.helpers.unit_format(self.logger, voltage, "mV")
except amdsmi_exception.AmdSmiLibraryException as e:
voltage_dict[volt_type] = "N/A"
logging.debug("Failed to get voltage for gpu %s | %s", gpu_id, e.get_error_info())
values_dict['voltage'] = voltage_dict
if "energy" in current_platform_args:
if args.energy:
try:
@@ -2991,7 +3008,7 @@ class AMDSMICommands():
usage=None, watch=None, watch_time=None, iterations=None, power=None,
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
xgmi_err=None, energy=None, mem_usage=None, schedule=None,
xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None,
guard=None, guest_data=None, fb_usage=None, xgmi=None,
cpu=None, cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None,
cpu_c0_res=None, cpu_lclk_dpm_level=None, cpu_pwr_svi_telemetry_rails=None,
@@ -3025,6 +3042,7 @@ class AMDSMICommands():
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
energy (bool, optional): Value override for args.energy. Defaults to None.
mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
voltage (bool, optional): Value override for args.voltage. Defaults to None.
schedule (bool, optional): Value override for args.schedule. Defaults to None.
guard (bool, optional): Value override for args.guard. Defaults to None.
guest_data (bool, optional): Value override for args.guest_data. Defaults to None.
@@ -3073,7 +3091,7 @@ class AMDSMICommands():
gpu_args_enabled = False
gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock",
"temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve",
"overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule",
"overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "voltage", "schedule",
"guard", "guest_data", "fb_usage", "xgmi", "throttle"]
for attr in gpu_attributes:
if hasattr(args, attr):
@@ -3146,7 +3164,7 @@ class AMDSMICommands():
usage, watch, watch_time, iterations, power,
clock, temperature, ecc, ecc_blocks, pcie,
fan, voltage_curve, overdrive, perf_level,
xgmi_err, energy, mem_usage, schedule,
xgmi_err, energy, mem_usage, voltage, schedule,
guard, guest_data, fb_usage, xgmi, throttle,
)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
@@ -3182,7 +3200,7 @@ class AMDSMICommands():
usage, watch, watch_time, iterations, power,
clock, temperature, ecc, ecc_blocks, pcie,
fan, voltage_curve, overdrive, perf_level,
xgmi_err, energy, mem_usage, schedule, throttle,
xgmi_err, energy, mem_usage, voltage, schedule, throttle,
)
if self.logger.is_json_format():
self.logger.combine_arrays_to_json()
+2
Просмотреть файл
@@ -874,6 +874,7 @@ class AMDSMIParser(argparse.ArgumentParser):
ecc_help = "Total number of ECC errors"
ecc_blocks_help = "Number of ECC errors per block"
pcie_help = "Current PCIe speed, width, and replay count"
voltage_help = "GPU voltage"
# Help text for Arguments only on Linux Baremetal platforms
fan_help = "Current fan speed"
@@ -939,6 +940,7 @@ class AMDSMIParser(argparse.ArgumentParser):
metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help)
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help)
metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help)
# Options that only apply to Hypervisors and Baremetal Linux
if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()):
+1
Просмотреть файл
@@ -313,6 +313,7 @@ Metric arguments:
-P, --pcie Current PCIe speed, width, and replay count
-e, --ecc Total number of ECC errors
-k, --ecc-blocks Number of ECC errors per block
-V, --voltage GPU voltage
-f, --fan Current fan speed
-C, --voltage-curve Display voltage curve
-o, --overdrive Current GPU clock overdrive and GPU memory clock overdrive level
+11 -7
Просмотреть файл
@@ -2112,16 +2112,17 @@ machine guest
Input parameters:
* `processor_handle` handle for the given device
* `sensor_type` part of device from which voltage should be obtained
* `metric` enum indicating which voltage value should be retrieved
Parameters | Description
---|---
`processor_handle` | Handle for the given device
`sensor_type` | <table><thead><tr><th> Possible Values </th><th> Description </th></tr></thead><tbody><tr><td>`AmdSmiVoltageType.VDDGFX`</td><td>Represents the voltage supplied to the GPU's graphics core.</td></tr><tr><td>`AmdSmiVoltageType.VDDBOARD`</td><td>Represents the voltage supplied to the entire GPU board, including auxiliary components. Intended for MI300+.</td></tr></tbody></table>
`metric` | <table><thead><tr><th> Possible Values </th><th> Description </th></tr></thead><tbody><tr><td>`AmdSmiVoltageMetric.CURRENT`</td><td>Represents the current voltage value measured at the specified sensor.</td></tr><tr><td>`AmdSmiVoltageMetric.MAX`</td><td>Represents the maximum voltage value recorded at the specified sensor.</td></tr><tr><td>`AmdSmiVoltageMetric.MIN`</td><td>Represents the minimum voltage value recorded at the specified sensor.</td></tr><tr><td>`AmdSmiVoltageMetric.AVERAGE`</td><td>Represents the average voltage value calculated over a period of time at the specified sensor.</td></tr><tr><td>`AmdSmiVoltageMetric.MAX_CRIT`</td><td>Represents the critical maximum voltage value that should not be exceeded.</td></tr><tr><td>`AmdSmiVoltageMetric.MIN_CRIT`</td><td>Represents the critical minimum voltage value that should not be dropped below.</td></tr><tr><td>`AmdSmiVoltageMetric.LOWEST`</td><td>Represents the lowest voltage value recorded during the monitoring period.</td></tr><tr><td>`AmdSmiVoltageMetric.HIGHEST`</td><td>Represents the highest voltage value recorded during the monitoring period.</td></tr></tbody></table>
Output: Voltage as integer in millivolts
Exceptions that can be thrown by `amdsmi_get_gpu_volt_metric` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
@@ -2133,8 +2134,11 @@ try:
print("No GPUs on machine")
else:
for device in devices:
voltage = amdsmi_get_gpu_volt_metric(device, AmdSmiVoltageType.VDDGFX,
AmdSmiVoltageMetric.AVERAGE)
voltage = amdsmi_get_gpu_volt_metric(
device,
AmdSmiVoltageType.VDDBOARD,
AmdSmiVoltageMetric.AVERAGE
)
print(voltage)
except AmdSmiException as e:
print(e)
@@ -2692,7 +2696,7 @@ except AmdSmiException as e:
### amdsmi_get_gpu_power_profile_presets
Description: Get the list of available preset power profiles and an indication of
Description: Get the list of available preset power profiles and an indication of
which profile is currently active. It is not supported on virtual machine guest
Input parameters:
+2 -1
Просмотреть файл
@@ -1360,7 +1360,8 @@ typedef enum {
AMDSMI_VOLT_TYPE_FIRST = 0,
AMDSMI_VOLT_TYPE_VDDGFX = AMDSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage
AMDSMI_VOLT_TYPE_LAST = AMDSMI_VOLT_TYPE_VDDGFX,
AMDSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD
AMDSMI_VOLT_TYPE_LAST = AMDSMI_VOLT_TYPE_VDDBOARD,
AMDSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
} amdsmi_voltage_type_t;
+1
Просмотреть файл
@@ -309,6 +309,7 @@ class AmdSmiVoltageMetric(IntEnum):
class AmdSmiVoltageType(IntEnum):
VDDGFX = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDGFX
VDDBOARD = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDBOARD
INVALID = amdsmi_wrapper.AMDSMI_VOLT_TYPE_INVALID
class AmdSmiAcceleratorPartitionResourceType(IntEnum):
+5 -2
Просмотреть файл
@@ -1530,12 +1530,14 @@ amdsmi_voltage_metric_t = ctypes.c_uint32 # enum
amdsmi_voltage_type_t__enumvalues = {
0: 'AMDSMI_VOLT_TYPE_FIRST',
0: 'AMDSMI_VOLT_TYPE_VDDGFX',
0: 'AMDSMI_VOLT_TYPE_LAST',
1: 'AMDSMI_VOLT_TYPE_VDDBOARD',
1: 'AMDSMI_VOLT_TYPE_LAST',
4294967295: 'AMDSMI_VOLT_TYPE_INVALID',
}
AMDSMI_VOLT_TYPE_FIRST = 0
AMDSMI_VOLT_TYPE_VDDGFX = 0
AMDSMI_VOLT_TYPE_LAST = 0
AMDSMI_VOLT_TYPE_VDDBOARD = 1
AMDSMI_VOLT_TYPE_LAST = 1
AMDSMI_VOLT_TYPE_INVALID = 4294967295
amdsmi_voltage_type_t = ctypes.c_uint32 # enum
@@ -3161,6 +3163,7 @@ __all__ = \
'AMDSMI_VOLT_MAX_CRIT', 'AMDSMI_VOLT_MIN', 'AMDSMI_VOLT_MIN_CRIT',
'AMDSMI_VOLT_TYPE_FIRST', 'AMDSMI_VOLT_TYPE_INVALID',
'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDGFX',
'AMDSMI_VOLT_TYPE_VDDBOARD',
'AMDSMI_VRAM_TYPE_DDR2', 'AMDSMI_VRAM_TYPE_DDR3',
'AMDSMI_VRAM_TYPE_DDR4', 'AMDSMI_VRAM_TYPE_GDDR1',
'AMDSMI_VRAM_TYPE_GDDR2', 'AMDSMI_VRAM_TYPE_GDDR3',
+4 -3
Просмотреть файл
@@ -574,9 +574,10 @@ typedef enum {
typedef enum {
RSMI_VOLT_TYPE_FIRST = 0,
RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU
//!< voltage
RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDGFX,
RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage
RSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD
RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDBOARD,
RSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
} rsmi_voltage_type_t;
+9 -2
Просмотреть файл
@@ -88,7 +88,7 @@ static const char *kTempSensorTypeJunctionName = "junction";
static const char *kTempSensorTypeEdgeName = "edge";
static const char *kTempSensorTypeVddgfxName = "vddgfx";
static const char *kTempSensorTypeVddboardName = "vddboard";
static const std::map<std::string, rsmi_temperature_type_t>
kTempSensorNameMap = {
@@ -100,6 +100,7 @@ static const std::map<std::string, rsmi_temperature_type_t>
static const std::map<std::string, rsmi_voltage_type_t>
kVoltSensorNameMap = {
{kTempSensorTypeVddgfxName, RSMI_VOLT_TYPE_VDDGFX},
{kTempSensorTypeVddboardName, RSMI_VOLT_TYPE_VDDBOARD},
};
static const std::map<MonitorTypes, const char *> kMonitorNameMap = {
@@ -378,7 +379,13 @@ Monitor::setVoltSensorLabelMap(void) {
};
for (uint32_t i = 0; i < RSMI_VOLT_TYPE_LAST + 1; ++i) {
ret = add_volt_sensor_entry(i);
// VDDGFX -> 0, VDDNB -> 1, VDDBOARD -> 2
// Here the VDDNB will be skipped as it is not defined in the enum and not supported by AMD.
auto file_index = i;
if (i >= RSMI_VOLT_TYPE_VDDBOARD) {
file_index = i + 1;
}
ret = add_volt_sensor_entry(file_index);
if (ret) {
return ret;
}
+1
Просмотреть файл
@@ -123,6 +123,7 @@ static const char* short_options = "i:v:m:fr";
static const std::map<uint32_t, std::string> kVoltSensorNameMap = {
{AMDSMI_VOLT_TYPE_VDDGFX, "Vddgfx"},
{AMDSMI_VOLT_TYPE_VDDBOARD, "Vddboard"},
};
static void PrintHelp(void) {