Get and set the XGMI PLPD

Update the API and CLI to support XGMI Per-Link Power Down Policy.

Change-Id: Iaf04a771eb8bb0829a5b3088d803a7355a8dfd0b


[ROCm/amdsmi commit: e4085c6414]
This commit is contained in:
Bill(Shuzhou) Liu
2024-03-20 12:06:24 -05:00
کامیت شده توسط Maisam Arif
والد 16b0ff1657
کامیت b9b958b82c
11فایلهای تغییر یافته به همراه467 افزوده شده و 50 حذف شده
@@ -280,7 +280,7 @@ usage: amd-smi metric [-h] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE
[--core-curr-active-freq-core-limit] [--core-energy]
[--json | --csv] [--file FILE] [--loglevel LEVEL]
If no GPU is specified, returns metric information for all GPUs on the system.
If no GPU is specified, returns metric information for all GPUs on the system.
If no metric argument is provided all metric information will be displayed.
Metric arguments:
@@ -325,16 +325,16 @@ CPU Arguments:
--cpu-c0-res Displays C0 residency
--cpu-lclk-dpm-level NBIOID Displays lclk dpm level range. Requires socket ID and NBOID as inputs
--cpu-pwr-svi-telemtry-rails Displays svi based telemetry for all rails
--cpu-io-bandwidth IO_BW LINKID_NAME Displays current IO bandwidth for the selected CPU.
input parameters are bandwidth type(1) and link ID encodings
--cpu-io-bandwidth IO_BW LINKID_NAME Displays current IO bandwidth for the selected CPU.
input parameters are bandwidth type(1) and link ID encodings
i.e. P2, P3, G0 - G7
--cpu-xgmi-bandwidth XGMI_BW LINKID_NAME Displays current XGMI bandwidth for the selected CPU
input parameters are bandwidth type(1,2,4) and link ID encodings
--cpu-xgmi-bandwidth XGMI_BW LINKID_NAME Displays current XGMI bandwidth for the selected CPU
input parameters are bandwidth type(1,2,4) and link ID encodings
i.e. P2, P3, G0 - G7
--cpu-metrics-ver Displays metrics table version
--cpu-metrics-table Displays metric table
--cpu-socket-energy Displays socket energy for the selected CPU socket
--cpu-ddr-bandwidth Displays per socket max ddr bw, current utilized bw,
--cpu-ddr-bandwidth Displays per socket max ddr bw, current utilized bw,
and current utilized ddr bw in percentage
--cpu-temp Displays cpu socket temperature
--cpu-dimm-temp-range-rate DIMM_ADDR Displays dimm temperature range and refresh rate
@@ -437,7 +437,7 @@ usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
[-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [-a]
[-w] [-o] [-t] [-b]
If no GPU is specified, returns information for all GPUs on the system.
If no GPU is specified, returns information for all GPUs on the system.
If no topology argument is provided all topology information will be displayed.
Topology arguments:
@@ -483,7 +483,7 @@ usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...
[--core-boost-limit BOOST_LIMIT] [--json | --csv] [--file FILE]
[--loglevel LEVEL]
A GPU must be specified to set a configuration.
A GPU must be specified to set a configuration.
A set argument must be provided; Multiple set arguments are accepted
Set Arguments:
@@ -513,11 +513,12 @@ Set Arguments:
NPS1, NPS2, NPS4, NPS8
-o, --power-cap WATTS Set power capacity limit
-p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id
-x, --xgmi-plpd POLICY_ID Set the GPU XGMI per-link power down policy using policy id
CPU Arguments:
--cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value.
--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH Set max and Min linkwidth. Input parameters are min and max link width values
--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO.
--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO.
Input parameters are die_index, min dpm, max dpm.
--cpu-pwr-eff-mode MODE Sets the power efficency mode policy. Input parameter is mode.
--cpu-gmi3-link-width MIN_LW MAX_LW Sets max and min gmi3 link width range
@@ -675,7 +676,7 @@ GPU: 0
PARTITION:
COMPUTE_PARTITION: SPX
MEMORY_PARTITION: NPS1
POLICY:
DPM_POLICY:
NUM_SUPPORTED: 4
CURRENT_ID: 1
POLICIES:
@@ -687,6 +688,16 @@ GPU: 0
POLICY_DESCRIPTION: soc_pstate_1
POLICY_ID: 3
POLICY_DESCRIPTION: soc_pstate_2
XGMI_PLPD:
NUM_SUPPORTED: 3
CURRENT_ID: 1
PLPDS:
POLICY_ID: 0
POLICY_DESCRIPTION: plpd_disallow
POLICY_ID: 1
POLICY_DESCRIPTION: plpd_default
POLICY_ID: 2
POLICY_DESCRIPTION: plpd_optimized
NUMA:
NODE: 0
AFFINITY: 0
@@ -783,7 +794,7 @@ GPU: 1
PARTITION:
COMPUTE_PARTITION: SPX
MEMORY_PARTITION: NPS1
POLICY:
DPM_POLICY:
NUM_SUPPORTED: 4
CURRENT_ID: 1
POLICIES:
@@ -795,6 +806,16 @@ GPU: 1
POLICY_DESCRIPTION: soc_pstate_1
POLICY_ID: 3
POLICY_DESCRIPTION: soc_pstate_2
XGMI_PLPD:
NUM_SUPPORTED: 3
CURRENT_ID: 1
PLPDS:
POLICY_ID: 0
POLICY_DESCRIPTION: plpd_disallow
POLICY_ID: 1
POLICY_DESCRIPTION: plpd_default
POLICY_ID: 2
POLICY_DESCRIPTION: plpd_optimized
NUMA:
NODE: 1
AFFINITY: 1
@@ -891,7 +912,7 @@ GPU: 2
PARTITION:
COMPUTE_PARTITION: SPX
MEMORY_PARTITION: NPS1
POLICY:
DPM_POLICY:
NUM_SUPPORTED: 4
CURRENT_ID: 1
POLICIES:
@@ -903,6 +924,16 @@ GPU: 2
POLICY_DESCRIPTION: soc_pstate_1
POLICY_ID: 3
POLICY_DESCRIPTION: soc_pstate_2
XGMI_PLPD:
NUM_SUPPORTED: 3
CURRENT_ID: 1
PLPDS:
POLICY_ID: 0
POLICY_DESCRIPTION: plpd_disallow
POLICY_ID: 1
POLICY_DESCRIPTION: plpd_default
POLICY_ID: 2
POLICY_DESCRIPTION: plpd_optimized
NUMA:
NODE: 2
AFFINITY: 2
@@ -999,7 +1030,7 @@ GPU: 3
PARTITION:
COMPUTE_PARTITION: SPX
MEMORY_PARTITION: NPS1
POLICY:
DPM_POLICY:
NUM_SUPPORTED: 4
CURRENT_ID: 1
POLICIES:
@@ -1011,6 +1042,16 @@ GPU: 3
POLICY_DESCRIPTION: soc_pstate_1
POLICY_ID: 3
POLICY_DESCRIPTION: soc_pstate_2
XGMI_PLPD:
NUM_SUPPORTED: 3
CURRENT_ID: 1
PLPDS:
POLICY_ID: 0
POLICY_DESCRIPTION: plpd_disallow
POLICY_ID: 1
POLICY_DESCRIPTION: plpd_default
POLICY_ID: 2
POLICY_DESCRIPTION: plpd_optimized
NUMA:
NODE: 3
AFFINITY: 3
@@ -244,7 +244,8 @@ class AMDSMICommands():
def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None,
limit=None, driver=None, ras=None, board=None, numa=None, vram=None,
cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, policy=None):
cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None,
policy=None, xgmi_plpd=None):
"""Get Static information for target gpu
Args:
@@ -268,6 +269,7 @@ class AMDSMICommands():
fb_info (bool, optional): Value override for args.fb_info. Defaults to None.
num_vf (bool, optional): Value override for args.num_vf. Defaults to None.
policy (bool, optional): Value override for args.policy. Defaults to None.
xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None.
Returns:
None: Print output via AMDSMILogger to destination
"""
@@ -302,8 +304,10 @@ class AMDSMICommands():
args.limit = limit
if policy:
args.policy = policy
current_platform_args += ["ras", "limit", "partition", "policy"]
current_platform_values += [args.ras, args.limit, args.partition, args.policy]
if xgmi_plpd:
args.xgmi_plpd = xgmi_plpd
current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd"]
current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd]
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
if numa:
@@ -630,6 +634,15 @@ class AMDSMICommands():
logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['dpm_policy'] = policy_info
if 'xgmi_plpd' in current_platform_args:
if args.xgmi_plpd:
try:
policy_info = amdsmi_interface.amdsmi_get_xgmi_plpd(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
policy_info = "N/A"
logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['xgmi_plpd'] = policy_info
if 'numa' in current_platform_args:
if args.numa:
try:
@@ -766,7 +779,7 @@ class AMDSMICommands():
bus=None, vbios=None, limit=None, driver=None, ras=None,
board=None, numa=None, vram=None, cache=None, partition=None,
dfc_ucode=None, fb_info=None, num_vf=None, cpu=None,
interface_ver=None, policy=None):
interface_ver=None, policy=None, xgmi_plpd = None):
"""Get Static information for target gpu and cpu
Args:
@@ -790,6 +803,7 @@ class AMDSMICommands():
cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None.
interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None
policy (bool, optional): Value override for args.policy. Defaults to None.
xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None.
Raises:
IndexError: Index error if gpu list is empty
@@ -815,7 +829,7 @@ class AMDSMICommands():
gpu_args_enabled = False
gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras",
"board", "numa", "vram", "cache", "partition",
"dfc_ucode", "fb_info", "num_vf", "policy"]
"dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd"]
for attr in gpu_attributes:
if hasattr(args, attr):
if getattr(args, attr):
@@ -859,7 +873,7 @@ class AMDSMICommands():
self.static_gpu(args, multiple_devices, gpu, asic,
bus, vbios, limit, driver, ras,
board, numa, vram, cache, partition,
dfc_ucode, fb_info, num_vf, policy)
dfc_ucode, fb_info, num_vf, policy, xgmi_plpd)
def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True):
@@ -3090,7 +3104,7 @@ class AMDSMICommands():
def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
profile=None, perf_determinism=None, compute_partition=None,
memory_partition=None, power_cap=None, dpm_policy=None):
memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None):
"""Issue reset commands to target gpu(s)
Args:
@@ -3105,6 +3119,7 @@ class AMDSMICommands():
memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None.
power_cap (int, optional): Value override for args.power_cap. Defaults to None.
dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None.
xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None.
Raises:
ValueError: Value error if no gpu value is provided
@@ -3132,6 +3147,8 @@ class AMDSMICommands():
args.power_cap = power_cap
if dpm_policy:
args.dpm_policy = dpm_policy
if xgmi_plpd:
args.xgmi_plpd = xgmi_plpd
# Handle No GPU passed
if args.gpu == None:
raise ValueError('No GPU provided, specific GPU target(s) are needed')
@@ -3151,7 +3168,8 @@ class AMDSMICommands():
args.memory_partition,
args.perf_determinism is not None,
args.power_cap,
args.dpm_policy]):
args.dpm_policy,
args.xgmi_plpd]):
command = " ".join(sys.argv[1:])
raise AmdSmiRequiredCommandException(command, self.logger.format)
@@ -3225,6 +3243,15 @@ class AMDSMICommands():
raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}")
# Apply the requested XGMI per-link power down (PLPD) policy.
# Test for an int rather than truthiness: policy id 0 is a valid selection
# (the parser accepts any non-negative id), and a plain `if` would skip it.
if isinstance(args.xgmi_plpd, int):
    try:
        amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd)
    except amdsmi_exception.AmdSmiLibraryException as e:
        # A permission failure is reported distinctly so the user knows to elevate.
        if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
            raise PermissionError('Command requires elevation') from e
        raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e
    # Report the id that was actually applied (previously this printed args.dpm_policy).
    self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.xgmi_plpd}")
if isinstance(args.power_cap, int):
try:
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
@@ -3264,7 +3291,7 @@ class AMDSMICommands():
cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None,
cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None,
cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None,
soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None):
soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None):
"""Issue reset commands to target gpu(s)
Args:
@@ -3294,6 +3321,7 @@ class AMDSMICommands():
core (device_handle, optional): device_handle for target core. Defaults to None.
core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None
dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None.
xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None.
Raises:
ValueError: Value error if no gpu value is provided
@@ -3314,7 +3342,7 @@ class AMDSMICommands():
# Check if a GPU argument has been set
gpu_args_enabled = False
gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
"memory_partition", "power_cap", "dpm_policy"]
"memory_partition", "power_cap", "dpm_policy", "xgmi_plpd"]
for attr in gpu_attributes:
if hasattr(args, attr):
if getattr(args, attr) is not None:
@@ -3370,7 +3398,7 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_ouput()
self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
profile, perf_determinism, compute_partition,
memory_partition, power_cap, dpm_policy)
memory_partition, power_cap, dpm_policy, xgmi_plpd)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
if args.cpu == None and args.core == None:
raise ValueError('No CPU or CORE provided, specific target(s) are needed')
@@ -3389,7 +3417,7 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_ouput()
self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
profile, perf_determinism, compute_partition,
memory_partition, power_cap, dpm_policy)
memory_partition, power_cap, dpm_policy, xgmi_plpd)
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
@@ -544,6 +544,7 @@ class AMDSMIParser(argparse.ArgumentParser):
cache_help = "All cache information"
board_help = "All board information"
dpm_policy_help = "The available DPM policy"
xgmi_plpd_help = "The available XGMI per-link power down policy"
# Options arguments help text for Hypervisors and Baremetal
ras_help = "Displays RAS features information"
@@ -584,6 +585,7 @@ class AMDSMIParser(argparse.ArgumentParser):
static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help)
static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help)
static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help)
static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help)
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help)
@@ -966,6 +968,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}"
set_power_cap_help = "Set power capacity limit"
set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n"
set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id\n"
# Help text for CPU set options
set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value."
@@ -1002,6 +1005,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS')
set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID')
set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID')
if self.helpers.is_amd_hsmp_initialized():
# Optional CPU Args
@@ -3405,6 +3405,49 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle,
*/
amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle,
uint32_t policy_id);
/**
* @brief Get the xgmi per-link power down policy parameter for the processor
*
* @platform{gpu_bm_linux}
*
* @details Given a processor handle @p processor_handle, this function will write
* current xgmi plpd settings to @p xgmi_plpd. All the processors at the same socket
* will have the same policy.
*
* @param[in] processor_handle a processor handle
*
* @param[in, out] xgmi_plpd the xgmi plpd for this processor.
* If this parameter is nullptr, this function will return
* ::AMDSMI_STATUS_INVAL
*
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
*/
amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle,
amdsmi_dpm_policy_t* xgmi_plpd);
/**
* @brief Set the xgmi per-link power down policy parameter for the processor
*
* @platform{gpu_bm_linux}
*
* @details Given a processor handle @p processor_handle and an xgmi plpd id @p plpd_id,
* this function will set the xgmi plpd for this processor. All the processors at
* the same socket will be set to the same policy.
*
* @note This function requires root access
*
* @param[in] processor_handle a processor handle
*
* @param[in] plpd_id the xgmi plpd id to set. The id is the id in
* amdsmi_dpm_policy_entry_t, which can be obtained by calling
* amdsmi_get_xgmi_plpd()
*
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
*/
amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle,
uint32_t plpd_id);
/** @} End PerfCont */
/*****************************************************************************/
@@ -909,8 +909,8 @@ Field | Description
`name` | Name of process
`pid` | Process ID
`mem` | Process memory usage
`engine_usage`| <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
`memory_usage`| <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
Exceptions that can be thrown by `amdsmi_get_gpu_process_info` function:
@@ -2612,6 +2612,74 @@ except AmdSmiException as e:
print(e)
```
### amdsmi_set_xgmi_plpd
Description: Set the xgmi per-link power down policy parameter for the processor
Input parameters:
* `processor_handle` handle for the given device
* `policy_id` the xgmi plpd id to set.
Output: None
Exceptions that can be thrown by `amdsmi_set_xgmi_plpd` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
amdsmi_set_xgmi_plpd(device, 0)
except AmdSmiException as e:
print(e)
```
### amdsmi_get_xgmi_plpd
Description: Get the xgmi per-link power down policy parameter for the processor
Input parameters:
* `processor_handle` handle for the given device
Output: Dict containing information about xgmi per-link power down policy
Field | Description
---|---
`num_supported` | The number of supported policies
`current_id` | The `policy_id` of the currently active policy
`plpds` | List of supported policies; each entry is a dict with `policy_id` and `policy_description`
Exceptions that can be thrown by `amdsmi_get_xgmi_plpd` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
xgmi_plpd = amdsmi_get_xgmi_plpd(device)
print(xgmi_plpd)
except AmdSmiException as e:
print(e)
```
### amdsmi_set_gpu_overdrive_level
Description: **deprecated** Set the overdrive percent associated with the
@@ -2746,6 +2746,20 @@ def amdsmi_set_dpm_policy(
)
)
def amdsmi_set_xgmi_plpd(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
    policy_id: int,
):
    """Set the XGMI per-link power down (PLPD) policy for a processor.

    Args:
        processor_handle: handle of the target device.
        policy_id: id of the XGMI PLPD policy to apply.

    Raises:
        AmdSmiParameterException: if ``processor_handle`` is not an
            ``amdsmi_wrapper.amdsmi_processor_handle``.

    The status returned by the library call is handed to ``_check_res``.
    """
    handle_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, handle_type):
        raise AmdSmiParameterException(processor_handle, handle_type)

    status = amdsmi_wrapper.amdsmi_set_xgmi_plpd(processor_handle, policy_id)
    _check_res(status)
def amdsmi_set_gpu_overdrive_level(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int
):
@@ -3335,6 +3349,37 @@ def amdsmi_get_dpm_policy(
"policies": polices,
}
def amdsmi_get_xgmi_plpd(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
    """Query the XGMI per-link power down (PLPD) policies of a processor.

    Args:
        processor_handle: handle of the target device.

    Returns:
        A dict with:
          * ``num_supported`` - number of policies reported by the library,
          * ``current_id`` - ``policy_id`` of the entry the library marks as current,
          * ``plpds`` - list of ``{'policy_id', 'policy_description'}`` dicts,
            one per supported policy.

    Raises:
        AmdSmiParameterException: if ``processor_handle`` is not an
            ``amdsmi_wrapper.amdsmi_processor_handle``.

    The status returned by the library call is handed to ``_check_res``.
    """
    handle_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, handle_type):
        raise AmdSmiParameterException(processor_handle, handle_type)

    plpd_info = amdsmi_wrapper.amdsmi_dpm_policy_t()
    status = amdsmi_wrapper.amdsmi_get_xgmi_plpd(
        processor_handle, ctypes.byref(plpd_info)
    )
    _check_res(status)

    # Only the first num_supported slots of the fixed-size array are meaningful.
    plpd_entries = [
        {
            'policy_id': plpd_info.policies[idx].policy_id,
            'policy_description': plpd_info.policies[idx].policy_description.decode(),
        }
        for idx in range(0, plpd_info.num_supported)
    ]

    # `current` is an index into the policies array; report that entry's id.
    active_entry = plpd_info.policies[plpd_info.current]
    return {
        "num_supported": plpd_info.num_supported,
        "current_id": active_entry.policy_id,
        "plpds": plpd_entries,
    }
def amdsmi_get_gpu_od_volt_info(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
@@ -746,19 +746,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum
class struct_amdsmi_pcie_info_t(Structure):
pass
class struct_pcie_static_(Structure):
pass
struct_pcie_static_._pack_ = 1 # source:False
struct_pcie_static_._fields_ = [
('max_pcie_width', ctypes.c_uint16),
('PADDING_0', ctypes.c_ubyte * 2),
('max_pcie_speed', ctypes.c_uint32),
('pcie_interface_version', ctypes.c_uint32),
('slot_type', amdsmi_card_form_factor_t),
('reserved', ctypes.c_uint64 * 10),
]
class struct_pcie_metric_(Structure):
pass
@@ -777,6 +764,19 @@ struct_pcie_metric_._fields_ = [
('reserved', ctypes.c_uint64 * 13),
]
class struct_pcie_static_(Structure):
pass
struct_pcie_static_._pack_ = 1 # source:False
struct_pcie_static_._fields_ = [
('max_pcie_width', ctypes.c_uint16),
('PADDING_0', ctypes.c_ubyte * 2),
('max_pcie_speed', ctypes.c_uint32),
('pcie_interface_version', ctypes.c_uint32),
('slot_type', amdsmi_card_form_factor_t),
('reserved', ctypes.c_uint64 * 10),
]
struct_amdsmi_pcie_info_t._pack_ = 1 # source:False
struct_amdsmi_pcie_info_t._fields_ = [
('pcie_static', struct_pcie_static_),
@@ -2058,6 +2058,12 @@ amdsmi_get_dpm_policy.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct
amdsmi_set_dpm_policy = _libraries['libamd_smi.so'].amdsmi_set_dpm_policy
amdsmi_set_dpm_policy.restype = amdsmi_status_t
amdsmi_set_dpm_policy.argtypes = [amdsmi_processor_handle, uint32_t]
amdsmi_get_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_get_xgmi_plpd
amdsmi_get_xgmi_plpd.restype = amdsmi_status_t
amdsmi_get_xgmi_plpd.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_dpm_policy_t)]
amdsmi_set_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_set_xgmi_plpd
amdsmi_set_xgmi_plpd.restype = amdsmi_status_t
amdsmi_set_xgmi_plpd.argtypes = [amdsmi_processor_handle, uint32_t]
amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version
amdsmi_get_lib_version.restype = amdsmi_status_t
amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)]
@@ -2594,8 +2600,9 @@ __all__ = \
'amdsmi_get_processor_info', 'amdsmi_get_processor_type',
'amdsmi_get_socket_handles', 'amdsmi_get_socket_info',
'amdsmi_get_temp_metric', 'amdsmi_get_utilization_count',
'amdsmi_get_xgmi_info', 'amdsmi_gpu_block_t',
'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter',
'amdsmi_get_xgmi_info', 'amdsmi_get_xgmi_plpd',
'amdsmi_gpu_block_t', 'amdsmi_gpu_cache_info_t',
'amdsmi_gpu_control_counter',
'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter',
'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t',
'amdsmi_gpu_read_counter', 'amdsmi_gpu_xgmi_error_status',
@@ -2636,10 +2643,10 @@ __all__ = \
'amdsmi_set_gpu_overdrive_level', 'amdsmi_set_gpu_pci_bandwidth',
'amdsmi_set_gpu_perf_determinism_mode',
'amdsmi_set_gpu_perf_level', 'amdsmi_set_gpu_power_profile',
'amdsmi_set_power_cap', 'amdsmi_shut_down',
'amdsmi_smu_fw_version_t', 'amdsmi_socket_handle',
'amdsmi_status_code_to_string', 'amdsmi_status_t',
'amdsmi_stop_gpu_event_notification',
'amdsmi_set_power_cap', 'amdsmi_set_xgmi_plpd',
'amdsmi_shut_down', 'amdsmi_smu_fw_version_t',
'amdsmi_socket_handle', 'amdsmi_status_code_to_string',
'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification',
'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t',
'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type',
'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number',
@@ -3364,6 +3364,45 @@ rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind,
rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind,
uint32_t policy_id);
/**
* @brief Get the xgmi per-link power down policy parameter for a device
*
*
* @details Given a device index @p dv_ind, this function will write
* current xgmi plpd settings to @p xgmi_plpd. All the processors at the same socket
* will have the same policy.
*
* @param[in] dv_ind a device index
*
* @param[in, out] xgmi_plpd the xgmi_plpd policy for this device.
* If this parameter is nullptr, this function will return
* ::RSMI_STATUS_INVALID_ARGS
*
* @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail
*/
rsmi_status_t rsmi_dev_xgmi_plpd_get(uint32_t dv_ind,
rsmi_dpm_policy_t* xgmi_plpd);
/**
* @brief Set the xgmi per-link power down policy parameter for a device
*
*
* @details Given a device index @p dv_ind, and an xgmi plpd id @p plpd_id,
* this function will set the xgmi plpd for this device. All the processors at
* the same socket will be set to the same policy.
*
* @note This function requires root access
*
* @param[in] dv_ind a device index
*
* @param[in] plpd_id the xgmi plpd id to set. The id is the id in
* rsmi_dpm_policy_entry_t, which can be obtained by calling
* rsmi_dev_xgmi_plpd_get()
*
* @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail
*/
rsmi_status_t rsmi_dev_xgmi_plpd_set(uint32_t dv_ind,
uint32_t plpd_id);
/** @} */ // end of PerfCont
/*****************************************************************************/
@@ -2038,6 +2038,130 @@ rsmi_dev_dpm_policy_set(uint32_t dv_ind,
CATCH
}
// Reads the kDevDPMPolicy values for device dv_ind and parses the
// "xgmi plpd" section into *policy.
//
// Returns:
//   RSMI_STATUS_INVALID_ARGS    - policy is nullptr
//   RSMI_STATUS_NOT_SUPPORTED   - GetDevValueVec() reported a file error, or
//                                 no "xgmi plpd" section header was found
//   RSMI_STATUS_UNEXPECTED_DATA - a negative id, more entries than
//                                 RSMI_MAX_NUM_PM_POLICIES, or no entry
//                                 marked as current
//   RSMI_STATUS_SUCCESS         - section parsed and current entry identified
//   Any other status returned by GetDevValueVec() is passed through.
rsmi_status_t
rsmi_dev_xgmi_plpd_get(uint32_t dv_ind,
rsmi_dpm_policy_t* policy) {
rsmi_status_t ret;
std::vector<std::string> val_vec;
if (policy == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
}
// Zero the output so descriptions copied below stay NUL-terminated.
*policy = {};
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
DEVICE_MUTEX
ret = GetDevValueVec(amd::smi::kDevDPMPolicy, dv_ind, &val_vec);
if (ret == RSMI_STATUS_FILE_ERROR) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR "
<< "-> reporting RSMI_STATUS_NOT_SUPPORTED";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS"
<< " -> reporting " << amd::smi::getRSMIStatusString(ret);
LOG_ERROR(ss);
return ret;
}
/*
Parsing relies on the numeric id rather than the description string,
as the string may vary from SoC to SoC.
The current xgmi plpd is marked with a trailing *
xgmi plpd
0 : plpd_disallow
1 : plpd_default
2 : plpd_optimized*
*/
bool see_plpd_pstate = false;
bool see_current = false;
policy->num_supported = 0;
for (uint32_t i = 0; i < val_vec.size(); ++i) {
auto current_line = amd::smi::trim(val_vec[i]);
// Skip everything until the "xgmi plpd" section header is seen.
if (current_line == "xgmi plpd") {
see_plpd_pstate = true;
continue;
}
if (see_plpd_pstate == false) continue;
// Get tokens: <integer> : <string *>
std::vector<std::string> tokens;
std::istringstream f(current_line);
std::string s;
while (getline(f, s, ':')) {
tokens.push_back(s);
}
int value = 0;
// A line that is not "<integer> : <text>" ends the section.
if (tokens.size() < 2 || !amd::smi::stringToInteger(tokens[0], value)) {
break;
}
// Reject negative ids and guard the fixed-size policies array.
if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", Unexpected pstat data: the id is negative or too many plpd policies.";
LOG_ERROR(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}
policy->policies[policy->num_supported].policy_id = value;
std::string description = amd::smi::trim(tokens[1]);
if (current_line.back() == '*') { // current policy
description.pop_back(); // remove last *
description = amd::smi::trim(description);
// `current` stores the array index of the active entry, not its id.
policy->current = policy->num_supported;
see_current = true;
}
// Copy at most RSMI_MAX_POLICY_NAME-1 characters; the struct was zeroed above.
strncpy(policy->policies[policy->num_supported].policy_description,
description.c_str(),
RSMI_MAX_POLICY_NAME-1);
policy->num_supported++;
} // end for
// No "xgmi plpd" section header in the data.
if (!see_plpd_pstate) {
return RSMI_STATUS_NOT_SUPPORTED;
}
if (!see_current) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", Unexpected pstat data: cannot find the current plpd policy.";
LOG_ERROR(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}
// Section parsed and the current policy was identified.
return RSMI_STATUS_SUCCESS;
CATCH
}
// Selects the xgmi per-link power down policy `plpd_id` for device `dv_ind`
// by writing "xgmi <plpd_id>" to the kDevDPMPolicy device entry.
//
// Returns the rsmi status translated from the write result by
// amd::smi::ErrnoToRsmiStatus(). The REQUIRE_ROOT_ACCESS, DEVICE_MUTEX and
// GET_DEV_FROM_INDX macros may return early with their own status codes
// (NOTE(review): macro bodies are not visible here - confirm).
rsmi_status_t
rsmi_dev_xgmi_plpd_set(uint32_t dv_ind,
                       uint32_t plpd_id) {
  // The former outer `rsmi_status_t ret` was never read and was shadowed by
  // the write result below, so it has been dropped.
  TRY
  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);
  REQUIRE_ROOT_ACCESS
  DEVICE_MUTEX
  GET_DEV_FROM_INDX

  // The policy entry is shared with the DPM policy; the "xgmi " prefix
  // selects the xgmi plpd setting.
  std::string value = "xgmi " + std::to_string(plpd_id);
  int write_err = dev->writeDevInfo(amd::smi::kDevDPMPolicy , value);
  return amd::smi::ErrnoToRsmiStatus(write_err);
  CATCH
}
rsmi_status_t
rsmi_dev_dpm_policy_get(uint32_t dv_ind,
rsmi_dpm_policy_t* policy) {
@@ -2107,7 +2231,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind,
if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", Unexpeced pstat data: the id is negative or too many policies.";
<< ", Unexpected pstat data: the id is negative or too many policies.";
LOG_ERROR(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}
@@ -2132,7 +2256,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind,
if (!see_current) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", Unexpeced pstat data: cannot find the current policy.";
<< ", Unexpected pstat data: cannot find the current policy.";
LOG_ERROR(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}
@@ -536,8 +536,10 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
{"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}},
{"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}},
{"rsmi_dev_pm_metrics_info_get", {{kDevPmMetricsFName}, {}}},
{"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}},
{"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}},
{"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}},
{"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}},
{"rsmi_dev_xgmi_plpd_get", {{kDevDPMPolicyFName}, {}}},
{"rsmi_dev_xgmi_plpd_set", {{kDevDPMPolicyFName}, {}}},
{"rsmi_dev_reg_table_info_get", {{kDevRegMetricsFName}, {}}},
{"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}},
{"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}},
@@ -1369,6 +1369,22 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle,
reinterpret_cast<rsmi_dpm_policy_t*>(policy));
}
// Sets the xgmi per-link power down policy id `policy` for the given
// processor by forwarding to rsmi_dev_xgmi_plpd_set() through rsmi_wrapper().
amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle,
uint32_t policy) {
AMDSMI_CHECK_INIT();
return rsmi_wrapper(rsmi_dev_xgmi_plpd_set, processor_handle,
policy);
}
// Retrieves the xgmi per-link power down policies for the given processor
// by forwarding to rsmi_dev_xgmi_plpd_get() through rsmi_wrapper().
amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle,
amdsmi_dpm_policy_t* policy) {
AMDSMI_CHECK_INIT();
// NOTE(review): the cast presumes amdsmi_dpm_policy_t and rsmi_dpm_policy_t
// share the same layout - confirm against both headers.
return rsmi_wrapper(rsmi_dev_xgmi_plpd_get, processor_handle,
reinterpret_cast<rsmi_dpm_policy_t*>(policy));
}
amdsmi_status_t
amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle,
uint32_t *num_pages,