[SWDEV-520665] Add support for board voltage (#303)
* Add the API and CLI to show the board voltage. --------- Change-Id: Icb25bd653bb1d004704b5a21b378ca31b2b242c7 Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com> Signed-off-by: AL Musaffar, Yazen <Yazen.ALMusaffar@amd.com>
This commit is contained in:
کامیت شده توسط
GitHub
والد
13148c5d8e
کامیت
970560fc7c
@@ -229,6 +229,14 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
- Increasing available JPEG engines to 40.
|
||||
Current ASICs may not support all 40; unsupported engines are indicated as UINT16_MAX or N/A in the CLI.
|
||||
|
||||
- **Added support to get GPU voltage**.
|
||||
```shell
|
||||
$ amd-smi metric --voltage
|
||||
GPU: 0
|
||||
VOLTAGE:
|
||||
VDDBOARD: 52536 mV
|
||||
...
|
||||
```
|
||||
- **Added bad page threshold count**.
|
||||
- Added `amdsmi_get_gpu_bad_page_threshold` to Python API and CLI; root/sudo permissions required to display the count.
|
||||
|
||||
|
||||
@@ -1383,7 +1383,7 @@ class AMDSMICommands():
|
||||
usage=None, watch=None, watch_time=None, iterations=None, power=None,
|
||||
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
|
||||
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
|
||||
xgmi_err=None, energy=None, mem_usage=None, schedule=None,
|
||||
xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None,
|
||||
guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None,
|
||||
):
|
||||
"""Get Metric information for target gpu
|
||||
@@ -1410,6 +1410,7 @@ class AMDSMICommands():
|
||||
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
|
||||
energy (bool, optional): Value override for args.energy. Defaults to None.
|
||||
mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
|
||||
voltage (bool, optional): Value override for args.voltage. Defaults to None.
|
||||
schedule (bool, optional): Value override for args.schedule. Defaults to None.
|
||||
guard (bool, optional): Value override for args.guard. Defaults to None.
|
||||
guest_data (bool, optional): Value override for args.guest_data. Defaults to None.
|
||||
@@ -1458,9 +1459,9 @@ class AMDSMICommands():
|
||||
args.ecc = ecc
|
||||
if ecc_blocks:
|
||||
args.ecc_blocks = ecc_blocks
|
||||
current_platform_args += ["usage", "power", "clock", "temperature", "pcie", "ecc", "ecc_blocks"]
|
||||
current_platform_args += ["usage", "power", "clock", "temperature", "voltage", "pcie", "ecc", "ecc_blocks"]
|
||||
current_platform_values += [args.usage, args.power, args.clock,
|
||||
args.temperature, args.pcie]
|
||||
args.temperature, args.voltage, args.pcie]
|
||||
current_platform_values += [args.ecc, args.ecc_blocks]
|
||||
|
||||
if self.helpers.is_baremetal() and self.helpers.is_linux():
|
||||
@@ -2385,6 +2386,22 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['xgmi_err'] = "N/A"
|
||||
logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
if "voltage" in current_platform_args:
|
||||
if args.voltage:
|
||||
voltage_dict = {}
|
||||
all_voltage = {
|
||||
"vddboard": amdsmi_interface.AmdSmiVoltageType.VDDBOARD
|
||||
}
|
||||
for volt_type, volt_metric in all_voltage.items():
|
||||
try:
|
||||
voltage = amdsmi_interface.amdsmi_get_gpu_volt_metric(args.gpu, volt_metric, amdsmi_interface.AmdSmiVoltageMetric.CURRENT)
|
||||
if voltage == 0:
|
||||
voltage = "N/A"
|
||||
voltage_dict[volt_type] = self.helpers.unit_format(self.logger, voltage, "mV")
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
voltage_dict[volt_type] = "N/A"
|
||||
logging.debug("Failed to get voltage for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
values_dict['voltage'] = voltage_dict
|
||||
if "energy" in current_platform_args:
|
||||
if args.energy:
|
||||
try:
|
||||
@@ -2991,7 +3008,7 @@ class AMDSMICommands():
|
||||
usage=None, watch=None, watch_time=None, iterations=None, power=None,
|
||||
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
|
||||
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
|
||||
xgmi_err=None, energy=None, mem_usage=None, schedule=None,
|
||||
xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None,
|
||||
guard=None, guest_data=None, fb_usage=None, xgmi=None,
|
||||
cpu=None, cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None,
|
||||
cpu_c0_res=None, cpu_lclk_dpm_level=None, cpu_pwr_svi_telemetry_rails=None,
|
||||
@@ -3025,6 +3042,7 @@ class AMDSMICommands():
|
||||
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
|
||||
energy (bool, optional): Value override for args.energy. Defaults to None.
|
||||
mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
|
||||
voltage (bool, optional): Value override for args.voltage. Defaults to None.
|
||||
schedule (bool, optional): Value override for args.schedule. Defaults to None.
|
||||
guard (bool, optional): Value override for args.guard. Defaults to None.
|
||||
guest_data (bool, optional): Value override for args.guest_data. Defaults to None.
|
||||
@@ -3073,7 +3091,7 @@ class AMDSMICommands():
|
||||
gpu_args_enabled = False
|
||||
gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock",
|
||||
"temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve",
|
||||
"overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule",
|
||||
"overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "voltage", "schedule",
|
||||
"guard", "guest_data", "fb_usage", "xgmi", "throttle"]
|
||||
for attr in gpu_attributes:
|
||||
if hasattr(args, attr):
|
||||
@@ -3146,7 +3164,7 @@ class AMDSMICommands():
|
||||
usage, watch, watch_time, iterations, power,
|
||||
clock, temperature, ecc, ecc_blocks, pcie,
|
||||
fan, voltage_curve, overdrive, perf_level,
|
||||
xgmi_err, energy, mem_usage, schedule,
|
||||
xgmi_err, energy, mem_usage, voltage, schedule,
|
||||
guard, guest_data, fb_usage, xgmi, throttle,
|
||||
)
|
||||
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
|
||||
@@ -3182,7 +3200,7 @@ class AMDSMICommands():
|
||||
usage, watch, watch_time, iterations, power,
|
||||
clock, temperature, ecc, ecc_blocks, pcie,
|
||||
fan, voltage_curve, overdrive, perf_level,
|
||||
xgmi_err, energy, mem_usage, schedule, throttle,
|
||||
xgmi_err, energy, mem_usage, voltage, schedule, throttle,
|
||||
)
|
||||
if self.logger.is_json_format():
|
||||
self.logger.combine_arrays_to_json()
|
||||
|
||||
@@ -874,6 +874,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
ecc_help = "Total number of ECC errors"
|
||||
ecc_blocks_help = "Number of ECC errors per block"
|
||||
pcie_help = "Current PCIe speed, width, and replay count"
|
||||
voltage_help = "GPU voltage"
|
||||
|
||||
# Help text for Arguments only on Linux Baremetal platforms
|
||||
fan_help = "Current fan speed"
|
||||
@@ -939,6 +940,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help)
|
||||
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
|
||||
metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help)
|
||||
metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help)
|
||||
|
||||
# Options that only apply to Hypervisors and Baremetal Linux
|
||||
if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()):
|
||||
|
||||
@@ -313,6 +313,7 @@ Metric arguments:
|
||||
-P, --pcie Current PCIe speed, width, and replay count
|
||||
-e, --ecc Total number of ECC errors
|
||||
-k, --ecc-blocks Number of ECC errors per block
|
||||
-V, --voltage GPU voltage
|
||||
-f, --fan Current fan speed
|
||||
-C, --voltage-curve Display voltage curve
|
||||
-o, --overdrive Current GPU clock overdrive and GPU memory clock overdrive level
|
||||
|
||||
@@ -2112,16 +2112,17 @@ machine guest
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` handle for the given device
|
||||
* `sensor_type` part of device from which voltage should be obtained
|
||||
* `metric` enum indicated which voltage value should be retrieved
|
||||
Parameters | Description
|
||||
---|---
|
||||
`processor_handle` | Handle for the given device
|
||||
`sensor_type` | <table><thead><tr><th> Possible Values </th><th> Description </th></tr></thead><tbody><tr><td>`AmdSmiVoltageType.VDDGFX`</td><td>Represents the voltage supplied to the GPU's graphics core.</td></tr><tr><td>`AmdSmiVoltageType.VDDBOARD`</td><td>Represents the voltage supplied to the entire GPU board, including auxiliary components. Intended for MI300 and newer.</td></tr></tbody></table>
|
||||
`metric` | <table><thead><tr><th> Possible Values </th><th> Description </th></tr></thead><tbody><tr><td>`AmdSmiVoltageMetric.CURRENT`</td><td>Represents the current voltage value measured at the specified sensor.</td></tr><tr><td>`AmdSmiVoltageMetric.MAX`</td><td>Represents the maximum voltage value recorded at the specified sensor.</td></tr><tr><td>`AmdSmiVoltageMetric.MIN`</td><td>Represents the minimum voltage value recorded at the specified sensor.</td></tr><tr><td>`AmdSmiVoltageMetric.AVERAGE`</td><td>Represents the average voltage value calculated over a period of time at the specified sensor.</td></tr><tr><td>`AmdSmiVoltageMetric.MAX_CRIT`</td><td>Represents the critical maximum voltage value that should not be exceeded.</td></tr><tr><td>`AmdSmiVoltageMetric.MIN_CRIT`</td><td>Represents the critical minimum voltage value that should not be dropped below.</td></tr><tr><td>`AmdSmiVoltageMetric.LOWEST`</td><td>Represents the lowest voltage value recorded during the monitoring period.</td></tr><tr><td>`AmdSmiVoltageMetric.HIGHEST`</td><td>Represents the highest voltage value recorded during the monitoring period.</td></tr></tbody></table>
|
||||
|
||||
Output: Voltage as integer in millivolts
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_volt_metric` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiRetryException`
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
@@ -2133,8 +2134,11 @@ try:
|
||||
print("No GPUs on machine")
|
||||
else:
|
||||
for device in devices:
|
||||
voltage = amdsmi_get_gpu_volt_metric(device, AmdSmiVoltageType.VDDGFX,
|
||||
AmdSmiVoltageMetric.AVERAGE)
|
||||
voltage = amdsmi_get_gpu_volt_metric(
|
||||
device,
|
||||
AmdSmiVoltageType.VDDBOARD,
|
||||
AmdSmiVoltageMetric.AVERAGE
|
||||
)
|
||||
print(voltage)
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
@@ -2692,7 +2696,7 @@ except AmdSmiException as e:
|
||||
|
||||
### amdsmi_get_gpu_power_profile_presets
|
||||
|
||||
Description: Get the list of available preset power profiles and an indication of
|
||||
Description: Get the list of available preset power profiles and an indication of
|
||||
which profile is currently active. It is not supported on virtual machine guest
|
||||
|
||||
Input parameters:
|
||||
|
||||
@@ -1360,7 +1360,8 @@ typedef enum {
|
||||
AMDSMI_VOLT_TYPE_FIRST = 0,
|
||||
|
||||
AMDSMI_VOLT_TYPE_VDDGFX = AMDSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage
|
||||
AMDSMI_VOLT_TYPE_LAST = AMDSMI_VOLT_TYPE_VDDGFX,
|
||||
AMDSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD
|
||||
AMDSMI_VOLT_TYPE_LAST = AMDSMI_VOLT_TYPE_VDDBOARD,
|
||||
AMDSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
|
||||
} amdsmi_voltage_type_t;
|
||||
|
||||
|
||||
@@ -309,6 +309,7 @@ class AmdSmiVoltageMetric(IntEnum):
|
||||
|
||||
class AmdSmiVoltageType(IntEnum):
|
||||
VDDGFX = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDGFX
|
||||
VDDBOARD = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDBOARD
|
||||
INVALID = amdsmi_wrapper.AMDSMI_VOLT_TYPE_INVALID
|
||||
|
||||
class AmdSmiAcceleratorPartitionResourceType(IntEnum):
|
||||
|
||||
@@ -1530,12 +1530,14 @@ amdsmi_voltage_metric_t = ctypes.c_uint32 # enum
|
||||
amdsmi_voltage_type_t__enumvalues = {
|
||||
0: 'AMDSMI_VOLT_TYPE_FIRST',
|
||||
0: 'AMDSMI_VOLT_TYPE_VDDGFX',
|
||||
0: 'AMDSMI_VOLT_TYPE_LAST',
|
||||
1: 'AMDSMI_VOLT_TYPE_VDDBOARD',
|
||||
1: 'AMDSMI_VOLT_TYPE_LAST',
|
||||
4294967295: 'AMDSMI_VOLT_TYPE_INVALID',
|
||||
}
|
||||
AMDSMI_VOLT_TYPE_FIRST = 0
|
||||
AMDSMI_VOLT_TYPE_VDDGFX = 0
|
||||
AMDSMI_VOLT_TYPE_LAST = 0
|
||||
AMDSMI_VOLT_TYPE_VDDBOARD = 1
|
||||
AMDSMI_VOLT_TYPE_LAST = 1
|
||||
AMDSMI_VOLT_TYPE_INVALID = 4294967295
|
||||
amdsmi_voltage_type_t = ctypes.c_uint32 # enum
|
||||
|
||||
@@ -3161,6 +3163,7 @@ __all__ = \
|
||||
'AMDSMI_VOLT_MAX_CRIT', 'AMDSMI_VOLT_MIN', 'AMDSMI_VOLT_MIN_CRIT',
|
||||
'AMDSMI_VOLT_TYPE_FIRST', 'AMDSMI_VOLT_TYPE_INVALID',
|
||||
'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDGFX',
|
||||
'AMDSMI_VOLT_TYPE_VDDBOARD',
|
||||
'AMDSMI_VRAM_TYPE_DDR2', 'AMDSMI_VRAM_TYPE_DDR3',
|
||||
'AMDSMI_VRAM_TYPE_DDR4', 'AMDSMI_VRAM_TYPE_GDDR1',
|
||||
'AMDSMI_VRAM_TYPE_GDDR2', 'AMDSMI_VRAM_TYPE_GDDR3',
|
||||
|
||||
@@ -574,9 +574,10 @@ typedef enum {
|
||||
typedef enum {
|
||||
RSMI_VOLT_TYPE_FIRST = 0,
|
||||
|
||||
RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU
|
||||
//!< voltage
|
||||
RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDGFX,
|
||||
RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST, //!< Vddgfx GPU voltage
|
||||
RSMI_VOLT_TYPE_VDDBOARD, //!< Voltage for VDDBOARD
|
||||
|
||||
RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDBOARD,
|
||||
RSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
|
||||
} rsmi_voltage_type_t;
|
||||
|
||||
|
||||
@@ -88,7 +88,7 @@ static const char *kTempSensorTypeJunctionName = "junction";
|
||||
static const char *kTempSensorTypeEdgeName = "edge";
|
||||
|
||||
static const char *kTempSensorTypeVddgfxName = "vddgfx";
|
||||
|
||||
static const char *kTempSensorTypeVddboardName = "vddboard";
|
||||
|
||||
static const std::map<std::string, rsmi_temperature_type_t>
|
||||
kTempSensorNameMap = {
|
||||
@@ -100,6 +100,7 @@ static const std::map<std::string, rsmi_temperature_type_t>
|
||||
static const std::map<std::string, rsmi_voltage_type_t>
|
||||
kVoltSensorNameMap = {
|
||||
{kTempSensorTypeVddgfxName, RSMI_VOLT_TYPE_VDDGFX},
|
||||
{kTempSensorTypeVddboardName, RSMI_VOLT_TYPE_VDDBOARD},
|
||||
};
|
||||
|
||||
static const std::map<MonitorTypes, const char *> kMonitorNameMap = {
|
||||
@@ -378,7 +379,13 @@ Monitor::setVoltSensorLabelMap(void) {
|
||||
};
|
||||
|
||||
for (uint32_t i = 0; i < RSMI_VOLT_TYPE_LAST + 1; ++i) {
|
||||
ret = add_volt_sensor_entry(i);
|
||||
// VDDGFX -> 0, VDDNB -> 1, VDDBOARD -> 2
|
||||
// Here the VDDNB will be skipped as it is not defined in the enum and not supported by AMD.
|
||||
auto file_index = i;
|
||||
if (i >= RSMI_VOLT_TYPE_VDDBOARD) {
|
||||
file_index = i + 1;
|
||||
}
|
||||
ret = add_volt_sensor_entry(file_index);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -123,6 +123,7 @@ static const char* short_options = "i:v:m:fr";
|
||||
|
||||
static const std::map<uint32_t, std::string> kVoltSensorNameMap = {
|
||||
{AMDSMI_VOLT_TYPE_VDDGFX, "Vddgfx"},
|
||||
{AMDSMI_VOLT_TYPE_VDDBOARD, "Vddboard"},
|
||||
};
|
||||
|
||||
static void PrintHelp(void) {
|
||||
|
||||
مرجع در شماره جدید
Block a user