diff --git a/CHANGELOG.md b/CHANGELOG.md
index 536bf7cfdc..cbe903cdf5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -229,6 +229,14 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
- Increasing available JPEG engines to 40.
Current ASICs may not support all 40. These will be indicated as UINT16_MAX or N/A in CLI.
+- **Added support to get GPU voltage**.
+ ```shell
+ $ amd-smi metric --voltage
+ GPU: 0
+ VOLTAGE:
+ VDDBOARD: 52536 mV
+ ...
+ ```
- **Added bad page threshold count**.
- Added `amdsmi_get_gpu_bad_page_threshold` to Python API and CLI; root/sudo permissions required to display the count.
diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py
index 5ea59c0f8f..ffc91c34e1 100644
--- a/amdsmi_cli/amdsmi_commands.py
+++ b/amdsmi_cli/amdsmi_commands.py
@@ -1383,7 +1383,7 @@ class AMDSMICommands():
usage=None, watch=None, watch_time=None, iterations=None, power=None,
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
- xgmi_err=None, energy=None, mem_usage=None, schedule=None,
+ xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None,
guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None,
):
"""Get Metric information for target gpu
@@ -1410,6 +1410,7 @@ class AMDSMICommands():
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
energy (bool, optional): Value override for args.energy. Defaults to None.
mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
+ voltage (bool, optional): Value override for args.voltage. Defaults to None.
schedule (bool, optional): Value override for args.schedule. Defaults to None.
guard (bool, optional): Value override for args.guard. Defaults to None.
guest_data (bool, optional): Value override for args.guest_data. Defaults to None.
@@ -1458,9 +1459,9 @@ class AMDSMICommands():
args.ecc = ecc
if ecc_blocks:
args.ecc_blocks = ecc_blocks
- current_platform_args += ["usage", "power", "clock", "temperature", "pcie", "ecc", "ecc_blocks"]
+ current_platform_args += ["usage", "power", "clock", "temperature", "voltage", "pcie", "ecc", "ecc_blocks"]
current_platform_values += [args.usage, args.power, args.clock,
- args.temperature, args.pcie]
+ args.temperature, args.voltage, args.pcie]
current_platform_values += [args.ecc, args.ecc_blocks]
if self.helpers.is_baremetal() and self.helpers.is_linux():
@@ -2385,6 +2386,22 @@ class AMDSMICommands():
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['xgmi_err'] = "N/A"
logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info())
+ if "voltage" in current_platform_args:
+ if args.voltage:
+ voltage_dict = {}
+ all_voltage = {
+ "vddboard": amdsmi_interface.AmdSmiVoltageType.VDDBOARD
+ }
+ for volt_type, volt_metric in all_voltage.items():
+ try:
+ voltage = amdsmi_interface.amdsmi_get_gpu_volt_metric(args.gpu, volt_metric, amdsmi_interface.AmdSmiVoltageMetric.CURRENT)
+ if voltage == 0:
+ voltage = "N/A"
+ voltage_dict[volt_type] = self.helpers.unit_format(self.logger, voltage, "mV")
+ except amdsmi_exception.AmdSmiLibraryException as e:
+ voltage_dict[volt_type] = "N/A"
+ logging.debug("Failed to get voltage for gpu %s | %s", gpu_id, e.get_error_info())
+ values_dict['voltage'] = voltage_dict
if "energy" in current_platform_args:
if args.energy:
try:
@@ -2991,7 +3008,7 @@ class AMDSMICommands():
usage=None, watch=None, watch_time=None, iterations=None, power=None,
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
- xgmi_err=None, energy=None, mem_usage=None, schedule=None,
+ xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None,
guard=None, guest_data=None, fb_usage=None, xgmi=None,
cpu=None, cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None,
cpu_c0_res=None, cpu_lclk_dpm_level=None, cpu_pwr_svi_telemetry_rails=None,
@@ -3025,6 +3042,7 @@ class AMDSMICommands():
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
energy (bool, optional): Value override for args.energy. Defaults to None.
mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
+ voltage (bool, optional): Value override for args.voltage. Defaults to None.
schedule (bool, optional): Value override for args.schedule. Defaults to None.
guard (bool, optional): Value override for args.guard. Defaults to None.
guest_data (bool, optional): Value override for args.guest_data. Defaults to None.
@@ -3073,7 +3091,7 @@ class AMDSMICommands():
gpu_args_enabled = False
gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock",
"temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve",
- "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule",
+ "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "voltage", "schedule",
"guard", "guest_data", "fb_usage", "xgmi", "throttle"]
for attr in gpu_attributes:
if hasattr(args, attr):
@@ -3146,7 +3164,7 @@ class AMDSMICommands():
usage, watch, watch_time, iterations, power,
clock, temperature, ecc, ecc_blocks, pcie,
fan, voltage_curve, overdrive, perf_level,
- xgmi_err, energy, mem_usage, schedule,
+ xgmi_err, energy, mem_usage, voltage, schedule,
guard, guest_data, fb_usage, xgmi, throttle,
)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
@@ -3182,7 +3200,7 @@ class AMDSMICommands():
usage, watch, watch_time, iterations, power,
clock, temperature, ecc, ecc_blocks, pcie,
fan, voltage_curve, overdrive, perf_level,
- xgmi_err, energy, mem_usage, schedule, throttle,
+ xgmi_err, energy, mem_usage, voltage, schedule, throttle,
)
if self.logger.is_json_format():
self.logger.combine_arrays_to_json()
diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py
index ba09232b8a..95ed98eda3 100644
--- a/amdsmi_cli/amdsmi_parser.py
+++ b/amdsmi_cli/amdsmi_parser.py
@@ -874,6 +874,7 @@ class AMDSMIParser(argparse.ArgumentParser):
ecc_help = "Total number of ECC errors"
ecc_blocks_help = "Number of ECC errors per block"
pcie_help = "Current PCIe speed, width, and replay count"
+ voltage_help = "GPU voltage"
# Help text for Arguments only on Linux Baremetal platforms
fan_help = "Current fan speed"
@@ -939,6 +940,7 @@ class AMDSMIParser(argparse.ArgumentParser):
metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help)
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help)
+ metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help)
# Options that only apply to Hypervisors and Baremetal Linux
if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()):
diff --git a/docs/how-to/amdsmi-cli-tool.md b/docs/how-to/amdsmi-cli-tool.md
index 9f5d7e519e..db1b9618c9 100644
--- a/docs/how-to/amdsmi-cli-tool.md
+++ b/docs/how-to/amdsmi-cli-tool.md
@@ -313,6 +313,7 @@ Metric arguments:
-P, --pcie Current PCIe speed, width, and replay count
-e, --ecc Total number of ECC errors
-k, --ecc-blocks Number of ECC errors per block
+ -V, --voltage GPU voltage
-f, --fan Current fan speed
-C, --voltage-curve Display voltage curve
-o, --overdrive Current GPU clock overdrive and GPU memory clock overdrive level
diff --git a/docs/reference/amdsmi-py-api.md b/docs/reference/amdsmi-py-api.md
index e4f17b477c..9f0b2bbbab 100644
--- a/docs/reference/amdsmi-py-api.md
+++ b/docs/reference/amdsmi-py-api.md
@@ -2112,16 +2112,17 @@ machine guest
Input parameters:
-* `processor_handle` handle for the given device
-* `sensor_type` part of device from which voltage should be obtained
-* `metric` enum indicated which voltage value should be retrieved
+Parameters | Description
+---|---
+`processor_handle` | Handle for the given device
+`sensor_type` |
| Possible Values | Description |
|---|---|
| `AmdSmiVoltageType.VDDGFX` | Represents the voltage supplied to the GPU's graphics core. |
| `AmdSmiVoltageType.VDDBOARD` | Represents the voltage supplied to the entire GPU board, including auxiliary components. Intended for MI300+. |
+`metric` | | Possible Values | Description |
|---|---|
| `AmdSmiVoltageMetric.CURRENT` | Represents the current voltage value measured at the specified sensor. |
| `AmdSmiVoltageMetric.MAX` | Represents the maximum voltage value recorded at the specified sensor. |
| `AmdSmiVoltageMetric.MIN` | Represents the minimum voltage value recorded at the specified sensor. |
| `AmdSmiVoltageMetric.AVERAGE` | Represents the average voltage value calculated over a period of time at the specified sensor. |
| `AmdSmiVoltageMetric.MAX_CRIT` | Represents the critical maximum voltage value that should not be exceeded. |
| `AmdSmiVoltageMetric.MIN_CRIT` | Represents the critical minimum voltage value that should not be dropped below. |
| `AmdSmiVoltageMetric.LOWEST` | Represents the lowest voltage value recorded during the monitoring period. |
| `AmdSmiVoltageMetric.HIGHEST` | Represents the highest voltage value recorded during the monitoring period. |
Output: Voltage as integer in millivolts
Exceptions that can be thrown by `amdsmi_get_gpu_volt_metric` function:
* `AmdSmiLibraryException`
-* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
@@ -2133,8 +2134,11 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- voltage = amdsmi_get_gpu_volt_metric(device, AmdSmiVoltageType.VDDGFX,
- AmdSmiVoltageMetric.AVERAGE)
+ voltage = amdsmi_get_gpu_volt_metric(
+ device,
+ AmdSmiVoltageType.VDDBOARD,
+ AmdSmiVoltageMetric.AVERAGE
+ )
print(voltage)
except AmdSmiException as e:
print(e)
@@ -2692,7 +2696,7 @@ except AmdSmiException as e:
### amdsmi_get_gpu_power_profile_presets
-Description: Get the list of available preset power profiles and an indication of
+Description: Get the list of available preset power profiles and an indication of
which profile is currently active. It is not supported on virtual machine guest
Input parameters:
diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index e6fb728776..2ad4a30e2a 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -1360,7 +1360,8 @@ typedef enum {
AMDSMI_VOLT_TYPE_FIRST = 0,
AMDSMI_VOLT_TYPE_VDDGFX = AMDSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage
- AMDSMI_VOLT_TYPE_LAST = AMDSMI_VOLT_TYPE_VDDGFX,
+ AMDSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD
+ AMDSMI_VOLT_TYPE_LAST = AMDSMI_VOLT_TYPE_VDDBOARD,
AMDSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
} amdsmi_voltage_type_t;
diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py
index 32f038ac2d..fc942257b0 100644
--- a/py-interface/amdsmi_interface.py
+++ b/py-interface/amdsmi_interface.py
@@ -309,6 +309,7 @@ class AmdSmiVoltageMetric(IntEnum):
class AmdSmiVoltageType(IntEnum):
VDDGFX = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDGFX
+ VDDBOARD = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDBOARD
INVALID = amdsmi_wrapper.AMDSMI_VOLT_TYPE_INVALID
class AmdSmiAcceleratorPartitionResourceType(IntEnum):
diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py
index 0132aa0522..c915ba8467 100644
--- a/py-interface/amdsmi_wrapper.py
+++ b/py-interface/amdsmi_wrapper.py
@@ -1530,12 +1530,14 @@ amdsmi_voltage_metric_t = ctypes.c_uint32 # enum
amdsmi_voltage_type_t__enumvalues = {
0: 'AMDSMI_VOLT_TYPE_FIRST',
0: 'AMDSMI_VOLT_TYPE_VDDGFX',
- 0: 'AMDSMI_VOLT_TYPE_LAST',
+ 1: 'AMDSMI_VOLT_TYPE_VDDBOARD',
+ 1: 'AMDSMI_VOLT_TYPE_LAST',
4294967295: 'AMDSMI_VOLT_TYPE_INVALID',
}
AMDSMI_VOLT_TYPE_FIRST = 0
AMDSMI_VOLT_TYPE_VDDGFX = 0
-AMDSMI_VOLT_TYPE_LAST = 0
+AMDSMI_VOLT_TYPE_VDDBOARD = 1
+AMDSMI_VOLT_TYPE_LAST = 1
AMDSMI_VOLT_TYPE_INVALID = 4294967295
amdsmi_voltage_type_t = ctypes.c_uint32 # enum
@@ -3161,6 +3163,7 @@ __all__ = \
'AMDSMI_VOLT_MAX_CRIT', 'AMDSMI_VOLT_MIN', 'AMDSMI_VOLT_MIN_CRIT',
'AMDSMI_VOLT_TYPE_FIRST', 'AMDSMI_VOLT_TYPE_INVALID',
'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDGFX',
+ 'AMDSMI_VOLT_TYPE_VDDBOARD',
'AMDSMI_VRAM_TYPE_DDR2', 'AMDSMI_VRAM_TYPE_DDR3',
'AMDSMI_VRAM_TYPE_DDR4', 'AMDSMI_VRAM_TYPE_GDDR1',
'AMDSMI_VRAM_TYPE_GDDR2', 'AMDSMI_VRAM_TYPE_GDDR3',
diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h
index e73a2d3ec0..2b4476e0c0 100644
--- a/rocm_smi/include/rocm_smi/rocm_smi.h
+++ b/rocm_smi/include/rocm_smi/rocm_smi.h
@@ -574,9 +574,10 @@ typedef enum {
typedef enum {
RSMI_VOLT_TYPE_FIRST = 0,
- RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU
- //!< voltage
- RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDGFX,
+ RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage
+ RSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD
+
+ RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDBOARD,
RSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
} rsmi_voltage_type_t;
diff --git a/rocm_smi/src/rocm_smi_monitor.cc b/rocm_smi/src/rocm_smi_monitor.cc
index 85c204dd6c..cb6b36dd01 100644
--- a/rocm_smi/src/rocm_smi_monitor.cc
+++ b/rocm_smi/src/rocm_smi_monitor.cc
@@ -88,7 +88,7 @@ static const char *kTempSensorTypeJunctionName = "junction";
static const char *kTempSensorTypeEdgeName = "edge";
static const char *kTempSensorTypeVddgfxName = "vddgfx";
-
+static const char *kTempSensorTypeVddboardName = "vddboard";
static const std::map
kTempSensorNameMap = {
@@ -100,6 +100,7 @@ static const std::map
static const std::map
kVoltSensorNameMap = {
{kTempSensorTypeVddgfxName, RSMI_VOLT_TYPE_VDDGFX},
+ {kTempSensorTypeVddboardName, RSMI_VOLT_TYPE_VDDBOARD},
};
static const std::map kMonitorNameMap = {
@@ -378,7 +379,13 @@ Monitor::setVoltSensorLabelMap(void) {
};
for (uint32_t i = 0; i < RSMI_VOLT_TYPE_LAST + 1; ++i) {
- ret = add_volt_sensor_entry(i);
+ // File indices: VDDGFX -> 0, VDDNB -> 1, VDDBOARD -> 2.
+ // VDDNB is not defined in the enum and is not supported by AMD, so enum values >= VDDBOARD use index i + 1.
+ auto file_index = i;
+ if (i >= RSMI_VOLT_TYPE_VDDBOARD) {
+ file_index = i + 1;
+ }
+ ret = add_volt_sensor_entry(file_index);
if (ret) {
return ret;
}
diff --git a/tests/amd_smi_test/test_common.cc b/tests/amd_smi_test/test_common.cc
index f565d0e525..58a313cb9f 100644
--- a/tests/amd_smi_test/test_common.cc
+++ b/tests/amd_smi_test/test_common.cc
@@ -123,6 +123,7 @@ static const char* short_options = "i:v:m:fr";
static const std::map kVoltSensorNameMap = {
{AMDSMI_VOLT_TYPE_VDDGFX, "Vddgfx"},
+ {AMDSMI_VOLT_TYPE_VDDBOARD, "Vddboard"},
};
static void PrintHelp(void) {