[SWDEV-532769] amd-smi APIs mismatch with documentation (#428)

* Populated socket_power to get power info
---------

Signed-off-by: josnarlo <Joseph.Narlo@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>
This commit is contained in:
Narlo, Joseph
2025-06-03 17:12:13 -05:00
committed by GitHub
parent 8f943b03e1
commit ce7d6dfe61
6 changed files with 79 additions and 67 deletions
+11
View File
@@ -88,6 +88,11 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
- **Added `amdsmi_get_cpu_affinity_with_scope()`**.
- **Added `socket power` to `amdsmi_get_power_info`**
- Previously the C API had the value in the `amdsmi_power_info` structure, but it was unused
- Now we populate the value in both C & Python APIs
- The value is representative of the socket's power, agnostic of the GPU version.
### Changed
- **Padded `asic_serial` in `amdsmi_get_asic_info` with 0s**.
@@ -162,6 +167,12 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
- **Removed `amdsmi_io_link_type_t` and replaced with amdsmi_link_type_t**.
- The IO Link type is no longer needed as the link type is sufficient.
- Mapping from amdsmi_io_link_type_t to amdsmi_link_type_t is as follows:
```shell
AMDSMI_IOLINK_TYPE_UNDEFINED == AMDSMI_LINK_TYPE_INTERNAL
AMDSMI_IOLINK_TYPE_PCIEXPRESS == AMDSMI_LINK_TYPE_PCIE
AMDSMI_IOLINK_TYPE_XGMI == AMDSMI_LINK_TYPE_XGMI
```
- **Removed `amdsmi_get_power_info_v2()`**.
- The amdsmi_get_power_info() has been unified and the v2 function is no longer needed/used.
+7 -13
View File
@@ -1844,22 +1844,16 @@ class AMDSMICommands():
power_unit = "W"
power_info = amdsmi_interface.amdsmi_get_power_info(args.gpu)
for key, value in power_info.items():
if value == 0xFFFF:
power_info[key] = "N/A"
elif "voltage" in key:
if "voltage" in key:
power_info[key] = self.helpers.unit_format(self.logger,
value,
voltage_unit)
elif "power" in key:
if ((key == "current_socket_power" or key == "average_socket_power")
and value != "N/A"):
power_dict['socket_power'] = self.helpers.unit_format(self.logger,
value,
power_unit)
value,
voltage_unit)
elif key == "socket_power":
power_info[key] = self.helpers.unit_format(self.logger,
value,
power_unit)
value,
power_unit)
power_dict['socket_power'] = power_info['socket_power']
power_dict['gfx_voltage'] = power_info['gfx_voltage']
power_dict['soc_voltage'] = power_info['soc_voltage']
power_dict['mem_voltage'] = power_info['mem_voltage']
+22 -24
View File
@@ -460,12 +460,12 @@ try:
print("No GPUs on machine")
else:
for device in devices:
power_info = amdsmi_get_power_cap_info(device)
print(power_info['power_cap'])
print(power_info['dpm_cap'])
print(power_info['default_power_cap'])
print(power_info['min_power_cap'])
print(power_info['max_power_cap'])
power_cap_info = amdsmi_get_power_cap_info(device)
print(power_cap_info['power_cap'])
print(power_cap_info['dpm_cap'])
print(power_cap_info['default_power_cap'])
print(power_cap_info['min_power_cap'])
print(power_cap_info['max_power_cap'])
except AmdSmiException as e:
print(e)
```
@@ -736,18 +736,18 @@ It is not supported on virtual machine guest
Input parameters:
* `processor_handle` device which to query
* `sensor_ind` optional argument that defaults to 0
Output: Dictionary with fields
Field | Description
---|---
`current_socket_power` | current socket power; Mi300+ Series Cards
`average_socket_power` | average socket power; Navi + Mi 200 and earlier Series cards
`gfx_voltage` | voltage gfx
`soc_voltage` | voltage soc
`mem_voltage` | voltage mem
`power_limit` | power limit
Field | Description | Units
---|---|---
`socket_power` | socket power; matches current or average socket power | W
`current_socket_power` | current socket power; Mi300+ Series Cards | W
`average_socket_power` | average socket power; Navi + Mi 200 and earlier Series cards | W
`gfx_voltage` | voltage gfx | mV
`soc_voltage` | voltage soc | mV
`mem_voltage` | voltage mem | mV
`power_limit` | power limit | W
Exceptions that can be thrown by `amdsmi_get_power_info` function:
@@ -764,15 +764,13 @@ try:
print("No GPUs on machine")
else:
for device in devices:
power_measure = amdsmi_get_power_info(device)
# Example with using sensor_ind
# power_measure = amdsmi_get_power_info(device, 0)
print(power_measure['current_socket_power'])
print(power_measure['average_socket_power'])
print(power_measure['gfx_voltage'])
print(power_measure['soc_voltage'])
print(power_measure['mem_voltage'])
print(power_measure['power_limit'])
power_info = amdsmi_get_power_info(device)
print(power_info['current_socket_power'])
print(power_info['average_socket_power'])
print(power_info['gfx_voltage'])
print(power_info['soc_voltage'])
print(power_info['mem_voltage'])
print(power_info['power_limit'])
except AmdSmiException as e:
print(e)
```
+16 -15
View File
@@ -352,19 +352,20 @@ typedef enum {
* @cond @tag{gpu_bm_linux} @tag{host} @tag{guest_windows} @endcond
*/
typedef enum {
AMDSMI_CLK_TYPE_SYS = 0x0, //!< System clock
AMDSMI_CLK_TYPE_SYS = 0x0, //!< Graphics clock
AMDSMI_CLK_TYPE_FIRST = AMDSMI_CLK_TYPE_SYS,
AMDSMI_CLK_TYPE_GFX = AMDSMI_CLK_TYPE_SYS,
AMDSMI_CLK_TYPE_DF, //!< Data Fabric clock (for ASICs
//!< running on a separate clock)
AMDSMI_CLK_TYPE_DCEF, //!< Display Controller Engine clock
AMDSMI_CLK_TYPE_SOC,
AMDSMI_CLK_TYPE_MEM,
AMDSMI_CLK_TYPE_PCIE,
AMDSMI_CLK_TYPE_VCLK0,
AMDSMI_CLK_TYPE_VCLK1,
AMDSMI_CLK_TYPE_DCLK0,
AMDSMI_CLK_TYPE_DCLK1,
AMDSMI_CLK_TYPE_GFX = AMDSMI_CLK_TYPE_SYS, //!< Graphics clock
AMDSMI_CLK_TYPE_DF, /**< Data Fabric clock (for ASICs
running on a separate clock) */
AMDSMI_CLK_TYPE_DCEF, /**< Display Controller Engine Front clock,
timing/bandwidth signals to display */
AMDSMI_CLK_TYPE_SOC, //!< System On Chip clock, integrated circuit frequency
AMDSMI_CLK_TYPE_MEM, //!< Memory clock speed, system operating frequency
AMDSMI_CLK_TYPE_PCIE, //!< PCI Express clock, high bandwidth peripherals
AMDSMI_CLK_TYPE_VCLK0, //!< Video 0 clock, video processing units
AMDSMI_CLK_TYPE_VCLK1, //!< Video 1 clock, video processing units
AMDSMI_CLK_TYPE_DCLK0, //!< Display 1 clock, timing signals for display output
AMDSMI_CLK_TYPE_DCLK1, //!< Display 2 clock, timing signals for display output
AMDSMI_CLK_TYPE__MAX = AMDSMI_CLK_TYPE_DCLK1
} amdsmi_clk_type_t;
@@ -1027,9 +1028,9 @@ typedef struct {
* @cond @tag{gpu_bm_linux} @tag{host} @endcond
*/
typedef struct {
uint64_t socket_power; //!< Units in uW {@host}, Host only
uint32_t current_socket_power; //!< Units in W {@linux_bm}, Linux only, Mi 300+ Series cards
uint32_t average_socket_power; //!< Units in W {@linux_bm}, Linux only, Navi + Mi 200 and earlier Series cards
uint64_t socket_power; //!< Socket power in W {@linux_bm}, uW {@host}
uint32_t current_socket_power; //!< Current socket power in W {@linux_bm}, Linux only, Mi 300+ Series cards
uint32_t average_socket_power; //!< Average socket power in W {@linux_bm}, Linux only, Navi + Mi 200 and earlier Series cards
uint64_t gfx_voltage; //!< GFX voltage measurement in mV {@linux_bm} or V {@host}
uint64_t soc_voltage; //!< SOC voltage measurement in mV {@linux_bm} or V {@host}
uint64_t mem_voltage; //!< MEM voltage measurement in mV {@linux_bm} or V {@host}
+16 -15
View File
@@ -1958,18 +1958,18 @@ def amdsmi_get_power_cap_info(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)
power_info = amdsmi_wrapper.amdsmi_power_cap_info_t()
power_cap_info = amdsmi_wrapper.amdsmi_power_cap_info_t()
_check_res(
amdsmi_wrapper.amdsmi_get_power_cap_info(
processor_handle, ctypes.c_uint32(0), ctypes.byref(power_info)
processor_handle, ctypes.c_uint32(0), ctypes.byref(power_cap_info)
)
)
return {"power_cap": power_info.power_cap,
"default_power_cap": power_info.default_power_cap,
"dpm_cap": power_info.dpm_cap,
"min_power_cap": power_info.min_power_cap,
"max_power_cap": power_info.max_power_cap}
return {"power_cap": power_cap_info.power_cap,
"default_power_cap": power_cap_info.default_power_cap,
"dpm_cap": power_cap_info.dpm_cap,
"min_power_cap": power_cap_info.min_power_cap,
"max_power_cap": power_cap_info.max_power_cap}
def amdsmi_get_gpu_pm_metrics_info(
@@ -2733,20 +2733,21 @@ def amdsmi_get_power_info(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)
power_measure = amdsmi_wrapper.amdsmi_power_info_t()
power_info = amdsmi_wrapper.amdsmi_power_info_t()
_check_res(
amdsmi_wrapper.amdsmi_get_power_info(
processor_handle, ctypes.byref(power_measure)
processor_handle, ctypes.byref(power_info)
)
)
power_info_dict = {
"current_socket_power": power_measure.current_socket_power,
"average_socket_power": power_measure.average_socket_power,
"gfx_voltage": power_measure.gfx_voltage,
"soc_voltage": power_measure.soc_voltage,
"mem_voltage": power_measure.mem_voltage,
"power_limit" : power_measure.power_limit,
"socket_power": power_info.socket_power,
"current_socket_power": power_info.current_socket_power,
"average_socket_power": power_info.average_socket_power,
"gfx_voltage": power_info.gfx_voltage,
"soc_voltage": power_info.soc_voltage,
"mem_voltage": power_info.mem_voltage,
"power_limit" : power_info.power_limit,
}
for key, value in power_info_dict.items():
+7
View File
@@ -4224,6 +4224,7 @@ amdsmi_get_power_info(amdsmi_processor_handle processor_handle, amdsmi_power_inf
if (status != AMDSMI_STATUS_SUCCESS)
return status;
info->socket_power = 0xFFFF;
info->current_socket_power = 0xFFFF;
info->average_socket_power = 0xFFFF;
info->gfx_voltage = 0xFFFF;
@@ -4241,6 +4242,12 @@ amdsmi_get_power_info(amdsmi_processor_handle processor_handle, amdsmi_power_inf
info->mem_voltage = metrics.voltage_mem;
}
if (metrics.current_socket_power != 0xFFFF) {
info->socket_power = metrics.current_socket_power;
} else if (metrics.average_socket_power != 0xFFFF) {
info->socket_power = metrics.average_socket_power;
}
int power_limit = 0;
status = smi_amdgpu_get_power_cap(gpu_device, &power_limit);
if (status == AMDSMI_STATUS_SUCCESS) {