Set and get DPM policy for GPU device

Add new APIs to set and get dpm policy for the GPU device.

Change-Id: I26fa49cd17d0ce66bda3446c38945a6cf35717ff


[ROCm/amdsmi commit: 108e6d4ae6]
Этот коммит содержится в:
Bill(Shuzhou) Liu
2024-02-22 08:38:54 -06:00
коммит произвёл Shuzhou Liu
родитель 010d839dca
Коммит 46ab68f840
15 изменённых файлов: 506 добавлений и 39 удалений
+50 -1
Просмотреть файл
@@ -474,7 +474,7 @@ Command Modifiers:
```bash
usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]) [-f %]
[-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] [-M PARTITION]
[-o WATTS] [--cpu-pwr-limit PWR_LIMIT]
[-o WATTS] [-p POLICY] [--cpu-pwr-limit PWR_LIMIT]
[--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH]
[--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM] [--cpu-pwr-eff-mode MODE]
[--cpu-gmi3-link-width MIN_LW MAX_LW] [--cpu-pcie-link-rate LINK_RATE]
@@ -512,6 +512,7 @@ Set Arguments:
-M, --memory-partition PARTITION Set one of the following the memory partition modes:
NPS1, NPS2, NPS4, NPS8
-o, --power-cap WATTS Set power capacity limit
-p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id
CPU Arguments:
--cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value.
@@ -674,6 +675,18 @@ GPU: 0
PARTITION:
COMPUTE_PARTITION: SPX
MEMORY_PARTITION: NPS1
POLICY:
NUM_SUPPORTED: 4
CURRENT_ID: 1
POLICIES:
POLICY_ID: 0
POLICY_DESCRIPTION: pstate_default
POLICY_ID: 1
POLICY_DESCRIPTION: soc_pstate_0
POLICY_ID: 2
POLICY_DESCRIPTION: soc_pstate_1
POLICY_ID: 3
POLICY_DESCRIPTION: soc_pstate_2
NUMA:
NODE: 0
AFFINITY: 0
@@ -770,6 +783,18 @@ GPU: 1
PARTITION:
COMPUTE_PARTITION: SPX
MEMORY_PARTITION: NPS1
POLICY:
NUM_SUPPORTED: 4
CURRENT_ID: 1
POLICIES:
POLICY_ID: 0
POLICY_DESCRIPTION: pstate_default
POLICY_ID: 1
POLICY_DESCRIPTION: soc_pstate_0
POLICY_ID: 2
POLICY_DESCRIPTION: soc_pstate_1
POLICY_ID: 3
POLICY_DESCRIPTION: soc_pstate_2
NUMA:
NODE: 1
AFFINITY: 1
@@ -866,6 +891,18 @@ GPU: 2
PARTITION:
COMPUTE_PARTITION: SPX
MEMORY_PARTITION: NPS1
POLICY:
NUM_SUPPORTED: 4
CURRENT_ID: 1
POLICIES:
POLICY_ID: 0
POLICY_DESCRIPTION: pstate_default
POLICY_ID: 1
POLICY_DESCRIPTION: soc_pstate_0
POLICY_ID: 2
POLICY_DESCRIPTION: soc_pstate_1
POLICY_ID: 3
POLICY_DESCRIPTION: soc_pstate_2
NUMA:
NODE: 2
AFFINITY: 2
@@ -962,6 +999,18 @@ GPU: 3
PARTITION:
COMPUTE_PARTITION: SPX
MEMORY_PARTITION: NPS1
POLICY:
NUM_SUPPORTED: 4
CURRENT_ID: 1
POLICIES:
POLICY_ID: 0
POLICY_DESCRIPTION: pstate_default
POLICY_ID: 1
POLICY_DESCRIPTION: soc_pstate_0
POLICY_ID: 2
POLICY_DESCRIPTION: soc_pstate_1
POLICY_ID: 3
POLICY_DESCRIPTION: soc_pstate_2
NUMA:
NODE: 3
AFFINITY: 3
+42 -16
Просмотреть файл
@@ -244,7 +244,7 @@ class AMDSMICommands():
def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None,
limit=None, driver=None, ras=None, board=None, numa=None, vram=None,
cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None):
cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, policy=None):
"""Get Static information for target gpu
Args:
@@ -267,7 +267,7 @@ class AMDSMICommands():
dfc_ucode (bool, optional): Value override for args.dfc_ucode. Defaults to None.
fb_info (bool, optional): Value override for args.fb_info. Defaults to None.
num_vf (bool, optional): Value override for args.num_vf. Defaults to None.
policy (bool, optional): Value override for args.policy. Defaults to None.
Returns:
None: Print output via AMDSMILogger to destination
"""
@@ -300,8 +300,10 @@ class AMDSMICommands():
args.partition = partition
if limit:
args.limit = limit
current_platform_args += ["ras", "limit", "partition"]
current_platform_values += [args.ras, args.limit, args.partition]
if policy:
args.policy = policy
current_platform_args += ["ras", "limit", "partition", "policy"]
current_platform_values += [args.ras, args.limit, args.partition, args.policy]
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
if numa:
@@ -486,6 +488,7 @@ class AMDSMICommands():
shutdown_temp_vram_limit = "N/A"
logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
# Assign units
power_unit = 'W'
temp_unit_human_readable = '\N{DEGREE SIGN}C'
@@ -626,6 +629,15 @@ class AMDSMICommands():
static_dict['partition'] = {"compute_partition": compute_partition,
"memory_partition": memory_partition}
if 'policy' in current_platform_args:
if args.policy:
try:
policy_info = amdsmi_interface.amdsmi_get_dpm_policy(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
policy_info = "N/A"
logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['dpm_policy'] = policy_info
if 'numa' in current_platform_args:
if args.numa:
try:
@@ -762,7 +774,7 @@ class AMDSMICommands():
bus=None, vbios=None, limit=None, driver=None, ras=None,
board=None, numa=None, vram=None, cache=None, partition=None,
dfc_ucode=None, fb_info=None, num_vf=None, cpu=None,
interface_ver=None):
interface_ver=None, policy=None):
"""Get Static information for target gpu and cpu
Args:
@@ -785,7 +797,7 @@ class AMDSMICommands():
num_vf (bool, optional): Value override for args.num_vf. Defaults to None.
cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None.
interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None
policy (bool, optional): Value override for args.policy. Defaults to None.
Raises:
IndexError: Index error if gpu list is empty
@@ -811,7 +823,7 @@ class AMDSMICommands():
gpu_args_enabled = False
gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras",
"board", "numa", "vram", "cache", "partition",
"dfc_ucode", "fb_info", "num_vf"]
"dfc_ucode", "fb_info", "num_vf", "policy"]
for attr in gpu_attributes:
if hasattr(args, attr):
if getattr(args, attr):
@@ -841,7 +853,7 @@ class AMDSMICommands():
self.static_gpu(args, multiple_devices, gpu, asic,
bus, vbios, limit, driver, ras,
board, numa, vram, cache, partition,
dfc_ucode, fb_info, num_vf)
dfc_ucode, fb_info, num_vf, policy)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
if args.cpu == None:
args.cpu = self.cpu_handles
@@ -855,7 +867,7 @@ class AMDSMICommands():
self.static_gpu(args, multiple_devices, gpu, asic,
bus, vbios, limit, driver, ras,
board, numa, vram, cache, partition,
dfc_ucode, fb_info, num_vf)
dfc_ucode, fb_info, num_vf, policy)
def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True):
@@ -3096,7 +3108,7 @@ class AMDSMICommands():
def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
profile=None, perf_determinism=None, compute_partition=None,
memory_partition=None, power_cap=None):
memory_partition=None, power_cap=None, dpm_policy=None):
"""Issue reset commands to target gpu(s)
Args:
@@ -3110,6 +3122,7 @@ class AMDSMICommands():
compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None.
memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None.
power_cap (int, optional): Value override for args.power_cap. Defaults to None.
dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None.
Raises:
ValueError: Value error if no gpu value is provided
@@ -3135,7 +3148,8 @@ class AMDSMICommands():
args.memory_partition = memory_partition
if power_cap:
args.power_cap = power_cap
if dpm_policy:
args.dpm_policy = dpm_policy
# Handle No GPU passed
if args.gpu == None:
raise ValueError('No GPU provided, specific GPU target(s) are needed')
@@ -3154,7 +3168,8 @@ class AMDSMICommands():
args.compute_partition,
args.memory_partition,
args.perf_determinism is not None,
args.power_cap]):
args.power_cap,
args.dpm_policy]):
command = " ".join(sys.argv[1:])
raise AmdSmiRequiredCommandException(command, self.logger.format)
@@ -3218,6 +3233,16 @@ class AMDSMICommands():
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}")
if args.dpm_policy:
try:
amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}")
if isinstance(args.power_cap, int):
try:
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
@@ -3257,7 +3282,7 @@ class AMDSMICommands():
cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None,
cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None,
cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None,
soc_boost_limit=None, core=None, core_boost_limit=None):
soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None):
"""Issue reset commands to target gpu(s)
Args:
@@ -3286,6 +3311,7 @@ class AMDSMICommands():
core (device_handle, optional): device_handle for target core. Defaults to None.
core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None
dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None.
Raises:
ValueError: Value error if no gpu value is provided
@@ -3306,7 +3332,7 @@ class AMDSMICommands():
# Check if a GPU argument has been set
gpu_args_enabled = False
gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
"memory_partition", "power_cap"]
"memory_partition", "power_cap", "dpm_policy"]
for attr in gpu_attributes:
if hasattr(args, attr):
if getattr(args, attr) is not None:
@@ -3367,7 +3393,7 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_ouput()
self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
profile, perf_determinism, compute_partition,
memory_partition, power_cap)
memory_partition, power_cap, dpm_policy)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
if args.cpu == None and args.core == None:
raise ValueError('No CPU or CORE provided, specific target(s) are needed')
@@ -3386,7 +3412,7 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_ouput()
self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
profile, perf_determinism, compute_partition,
memory_partition, power_cap)
memory_partition, power_cap, dpm_policy)
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
-1
Просмотреть файл
@@ -632,7 +632,6 @@ class AMDSMIHelpers():
compute_partitions_str.remove('INVALID')
return compute_partitions_str
def get_memory_partition_types(self):
memory_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiMemoryPartitionType]
if 'UNKNOWN' in memory_partitions_str:
+4
Просмотреть файл
@@ -543,6 +543,7 @@ class AMDSMIParser(argparse.ArgumentParser):
vram_help = "All vram information"
cache_help = "All cache information"
board_help = "All board information"
dpm_policy_help = "The available DPM policy"
# Options arguments help text for Hypervisors and Baremetal
ras_help = "Displays RAS features information"
@@ -582,6 +583,7 @@ class AMDSMIParser(argparse.ArgumentParser):
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help)
static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help)
static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help)
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help)
@@ -963,6 +965,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}"
set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}"
set_power_cap_help = "Set power capacity limit"
set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n"
# Help text for CPU set options
set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value."
@@ -998,6 +1001,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION')
set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS')
set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID')
if self.helpers.is_amd_hsmp_initialized():
# Optional CPU Args
+12
Просмотреть файл
@@ -331,6 +331,18 @@ int main() {
printf(" Output of amdsmi_get_power_cap_info:\n");
std::cout << "\t\t Power Cap: " << cap_info.power_cap / 1000000
<< "W\n\n";
amdsmi_dpm_policy_t policy;
ret = amdsmi_get_dpm_policy(processor_handles[j], &policy);
if (ret != AMDSMI_STATUS_NOT_SUPPORTED) {
CHK_AMDSMI_RET(ret)
std::cout << "\t amdsmi_get_dpm_policy total:" << policy.num_supported
<<" current:" << policy.current << "\n";
for (int x=0; x < policy.num_supported; x++) {
std::cout << x <<": (" << policy.policies[x].policy_id
<<"," << policy.policies[x].policy_description << ")\n";
}
}
}
}
+73 -1
Просмотреть файл
@@ -151,7 +151,7 @@ typedef enum {
#define AMDSMI_LIB_VERSION_YEAR 24
//! Major version should be changed for every header change (adding/deleting APIs, changing names, fields of structures, etc.)
#define AMDSMI_LIB_VERSION_MAJOR 4
#define AMDSMI_LIB_VERSION_MAJOR 5
//! Minor version should be updated for each API change, but without changing headers
#define AMDSMI_LIB_VERSION_MINOR 0
@@ -1151,6 +1151,37 @@ typedef struct {
uint64_t frequency[AMDSMI_MAX_NUM_FREQUENCIES];
} amdsmi_frequencies_t;
/**
* @brief A single dpm policy entry: a numeric policy id paired with its
* textual description.
*/
typedef struct {
uint32_t policy_id;  //!< Policy id; this is the value accepted by amdsmi_set_dpm_policy()
char policy_description[AMDSMI_MAX_NAME];  //!< Description string of the policy
} amdsmi_dpm_policy_entry_t;
#define AMDSMI_MAX_NUM_PM_POLICIES 32
/**
* @brief This structure holds information about dpm policies: the list of
* supported policies and which entry of that list is currently active.
*/
typedef struct {
/**
* The number of supported policies, i.e. the number of valid entries
* in policies[].
*/
uint32_t num_supported;
/**
* Index into policies[] of the current policy. This is a position in the
* array, not a policy id; the id of the current policy is
* policies[current].policy_id.
*/
uint32_t current;
/**
* List of policies.
* Only the first num_supported policies are valid.
*/
amdsmi_dpm_policy_entry_t policies[AMDSMI_MAX_NUM_PM_POLICIES];
} amdsmi_dpm_policy_t;
/**
* @brief This structure holds information about the possible PCIe
* bandwidths. Specifically, the possible transfer rates and their
@@ -3333,6 +3364,47 @@ amdsmi_status_t amdsmi_set_gpu_overdrive_level(amdsmi_processor_handle processor
amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle,
amdsmi_clk_type_t clk_type, uint64_t freq_bitmask);
/**
* @brief Get the dpm policy for the processor
*
* @platform{gpu_bm_linux} @platform{guest_1vf}
*
* @details Given a processor handle @p processor_handle, this function will write
* current dpm policy settings to @p policy. All the processors at the same socket
* will have the same policy.
*
* @param[in] processor_handle a processor handle
*
* @param[in, out] policy the dpm policy for this processor.
* If this parameter is nullptr, this function will return
* ::AMDSMI_STATUS_INVAL
*
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
*/
amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle,
amdsmi_dpm_policy_t* policy);
/**
* @brief Set the dpm policy for the processor
*
* @platform{gpu_bm_linux} @platform{guest_1vf}
*
* @details Given a processor handle @p processor_handle and a dpm policy @p policy_id,
* this function will set the dpm policy for this processor. All the processors at
* the same socket will be set to the same policy.
*
* @note This function requires root access
*
* @param[in] processor_handle a processor handle
*
* @param[in] policy_id the dpm policy id to set. The id is the id in
* amdsmi_dpm_policy_entry_t, which can be obtained by calling
* amdsmi_get_dpm_policy()
*
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
*/
amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle,
uint32_t policy_id);
/** @} End PerfCont */
/*****************************************************************************/
+43 -1
Просмотреть файл
@@ -360,7 +360,6 @@ class AmdSmiProcessorType(IntEnum):
NON_AMD_GPU = amdsmi_wrapper.NON_AMD_GPU
NON_AMD_CPU = amdsmi_wrapper.NON_AMD_CPU
class AmdSmiEventReader:
def __init__(
self, processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
@@ -2690,6 +2689,19 @@ def amdsmi_set_clk_freq(
)
)
def amdsmi_set_dpm_policy(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
    policy_id: int,
):
    """Set the dpm policy of a processor.

    Args:
        processor_handle: handle of the target processor.
        policy_id: id of the dpm policy to apply.

    Raises:
        AmdSmiParameterException: if processor_handle is not an
            amdsmi_wrapper.amdsmi_processor_handle.

    The status returned by the library call is passed to _check_res.
    """
    handle_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, handle_type):
        raise AmdSmiParameterException(processor_handle, handle_type)

    status = amdsmi_wrapper.amdsmi_set_dpm_policy(processor_handle, policy_id)
    _check_res(status)
def amdsmi_set_gpu_overdrive_level(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int
@@ -3249,6 +3261,36 @@ def amdsmi_get_clk_freq(
"frequency": list(freq.frequency)[: freq.num_supported - 1],
}
def amdsmi_get_dpm_policy(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
    """Query the dpm policies of a processor.

    Args:
        processor_handle: handle of the target processor.

    Returns:
        A dict with:
            "num_supported": number of policies reported by the library,
            "current_id": policy_id of the entry the library marks as current,
            "policies": list of {'policy_id', 'policy_description'} dicts,
                one per supported policy.

    Raises:
        AmdSmiParameterException: if processor_handle is not an
            amdsmi_wrapper.amdsmi_processor_handle.

    The status returned by the library call is passed to _check_res.
    """
    handle_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, handle_type):
        raise AmdSmiParameterException(processor_handle, handle_type)

    raw = amdsmi_wrapper.amdsmi_dpm_policy_t()
    status = amdsmi_wrapper.amdsmi_get_dpm_policy(
        processor_handle, ctypes.byref(raw)
    )
    _check_res(status)

    # Only the first num_supported entries of the fixed-size array are valid.
    entries = [
        {
            'policy_id': raw.policies[idx].policy_id,
            'policy_description': raw.policies[idx].policy_description.decode(),
        }
        for idx in range(0, raw.num_supported)
    ]

    # `current` is an index into the policies array; report the id stored there.
    return {
        "num_supported": raw.num_supported,
        "current_id": raw.policies[raw.current].policy_id,
        "policies": entries,
    }
def amdsmi_get_gpu_od_volt_info(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
+49 -19
Просмотреть файл
@@ -746,6 +746,19 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum
class struct_amdsmi_pcie_info_t(Structure):
pass
class struct_pcie_static_(Structure):
pass
struct_pcie_static_._pack_ = 1 # source:False
struct_pcie_static_._fields_ = [
('max_pcie_width', ctypes.c_uint16),
('PADDING_0', ctypes.c_ubyte * 2),
('max_pcie_speed', ctypes.c_uint32),
('pcie_interface_version', ctypes.c_uint32),
('slot_type', amdsmi_card_form_factor_t),
('reserved', ctypes.c_uint64 * 10),
]
class struct_pcie_metric_(Structure):
pass
@@ -764,19 +777,6 @@ struct_pcie_metric_._fields_ = [
('reserved', ctypes.c_uint64 * 13),
]
class struct_pcie_static_(Structure):
pass
struct_pcie_static_._pack_ = 1 # source:False
struct_pcie_static_._fields_ = [
('max_pcie_width', ctypes.c_uint16),
('PADDING_0', ctypes.c_ubyte * 2),
('max_pcie_speed', ctypes.c_uint32),
('pcie_interface_version', ctypes.c_uint32),
('slot_type', amdsmi_card_form_factor_t),
('reserved', ctypes.c_uint64 * 10),
]
struct_amdsmi_pcie_info_t._pack_ = 1 # source:False
struct_amdsmi_pcie_info_t._fields_ = [
('pcie_static', struct_pcie_static_),
@@ -1480,6 +1480,27 @@ struct_amdsmi_frequencies_t._fields_ = [
]
amdsmi_frequencies_t = struct_amdsmi_frequencies_t
class struct_amdsmi_dpm_policy_entry_t(Structure):
pass
struct_amdsmi_dpm_policy_entry_t._pack_ = 1 # source:False
struct_amdsmi_dpm_policy_entry_t._fields_ = [
('policy_id', ctypes.c_uint32),
('policy_description', ctypes.c_char * 32),
]
amdsmi_dpm_policy_entry_t = struct_amdsmi_dpm_policy_entry_t
class struct_amdsmi_dpm_policy_t(Structure):
pass
struct_amdsmi_dpm_policy_t._pack_ = 1 # source:False
struct_amdsmi_dpm_policy_t._fields_ = [
('num_supported', ctypes.c_uint32),
('current', ctypes.c_uint32),
('policies', struct_amdsmi_dpm_policy_entry_t * 32),
]
amdsmi_dpm_policy_t = struct_amdsmi_dpm_policy_t
class struct_amdsmi_pcie_bandwidth_t(Structure):
pass
@@ -2030,6 +2051,12 @@ amdsmi_set_gpu_overdrive_level.argtypes = [amdsmi_processor_handle, uint32_t]
amdsmi_set_clk_freq = _libraries['libamd_smi.so'].amdsmi_set_clk_freq
amdsmi_set_clk_freq.restype = amdsmi_status_t
amdsmi_set_clk_freq.argtypes = [amdsmi_processor_handle, amdsmi_clk_type_t, uint64_t]
amdsmi_get_dpm_policy = _libraries['libamd_smi.so'].amdsmi_get_dpm_policy
amdsmi_get_dpm_policy.restype = amdsmi_status_t
amdsmi_get_dpm_policy.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_dpm_policy_t)]
amdsmi_set_dpm_policy = _libraries['libamd_smi.so'].amdsmi_set_dpm_policy
amdsmi_set_dpm_policy.restype = amdsmi_status_t
amdsmi_set_dpm_policy.argtypes = [amdsmi_processor_handle, uint32_t]
amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version
amdsmi_get_lib_version.restype = amdsmi_status_t
amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)]
@@ -2486,7 +2513,8 @@ __all__ = \
'amdsmi_cpu_apb_enable', 'amdsmi_cpusocket_handle',
'amdsmi_ddr_bw_metrics_t', 'amdsmi_dev_perf_level_t',
'amdsmi_dimm_power_t', 'amdsmi_dimm_thermal_t',
'amdsmi_dpm_level_t', 'amdsmi_driver_info_t',
'amdsmi_dpm_level_t', 'amdsmi_dpm_policy_entry_t',
'amdsmi_dpm_policy_t', 'amdsmi_driver_info_t',
'amdsmi_engine_usage_t', 'amdsmi_error_count_t',
'amdsmi_event_group_t', 'amdsmi_event_handle_t',
'amdsmi_event_type_t', 'amdsmi_evt_notification_data_t',
@@ -2516,10 +2544,10 @@ __all__ = \
'amdsmi_get_cpu_socket_power', 'amdsmi_get_cpu_socket_power_cap',
'amdsmi_get_cpu_socket_power_cap_max',
'amdsmi_get_cpu_socket_temperature', 'amdsmi_get_cpucore_handles',
'amdsmi_get_cpusocket_handles', 'amdsmi_get_energy_count',
'amdsmi_get_esmi_err_msg', 'amdsmi_get_fw_info',
'amdsmi_get_gpu_activity', 'amdsmi_get_gpu_asic_info',
'amdsmi_get_gpu_available_counters',
'amdsmi_get_cpusocket_handles', 'amdsmi_get_dpm_policy',
'amdsmi_get_energy_count', 'amdsmi_get_esmi_err_msg',
'amdsmi_get_fw_info', 'amdsmi_get_gpu_activity',
'amdsmi_get_gpu_asic_info', 'amdsmi_get_gpu_available_counters',
'amdsmi_get_gpu_bad_page_info', 'amdsmi_get_gpu_bdf_id',
'amdsmi_get_gpu_board_info', 'amdsmi_get_gpu_cache_info',
'amdsmi_get_gpu_compute_partition',
@@ -2599,7 +2627,8 @@ __all__ = \
'amdsmi_set_cpu_socket_boostlimit',
'amdsmi_set_cpu_socket_lclk_dpm_level',
'amdsmi_set_cpu_socket_power_cap', 'amdsmi_set_cpu_xgmi_width',
'amdsmi_set_gpu_clk_range', 'amdsmi_set_gpu_compute_partition',
'amdsmi_set_dpm_policy', 'amdsmi_set_gpu_clk_range',
'amdsmi_set_gpu_compute_partition',
'amdsmi_set_gpu_event_notification_mask',
'amdsmi_set_gpu_fan_speed', 'amdsmi_set_gpu_memory_partition',
'amdsmi_set_gpu_od_clk_info', 'amdsmi_set_gpu_od_volt_info',
@@ -2625,6 +2654,7 @@ __all__ = \
'struct_amdsmi_clk_info_t', 'struct_amdsmi_counter_value_t',
'struct_amdsmi_ddr_bw_metrics_t', 'struct_amdsmi_dimm_power_t',
'struct_amdsmi_dimm_thermal_t', 'struct_amdsmi_dpm_level_t',
'struct_amdsmi_dpm_policy_entry_t', 'struct_amdsmi_dpm_policy_t',
'struct_amdsmi_driver_info_t', 'struct_amdsmi_engine_usage_t',
'struct_amdsmi_error_count_t',
'struct_amdsmi_evt_notification_data_t',
+69
Просмотреть файл
@@ -192,6 +192,39 @@ typedef enum {
RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 //!< Unknown performance level
} rsmi_dev_perf_level_t;
#define RSMI_MAX_NUM_PM_POLICIES 32
#define RSMI_MAX_POLICY_NAME 32
/**
* @brief A single dpm policy entry: a numeric policy id paired with its
* textual description.
*/
typedef struct {
uint32_t policy_id;  //!< Policy id; this is the value accepted by rsmi_dev_dpm_policy_set()
char policy_description[RSMI_MAX_POLICY_NAME];  //!< Description string of the policy
} rsmi_dpm_policy_entry_t;
/**
* @brief This structure holds information about dpm policies: the list of
* supported policies and which entry of that list is currently active.
*/
typedef struct {
/**
* The number of supported policies, i.e. the number of valid entries
* in policies[].
*/
uint32_t num_supported;
/**
* Index into policies[] of the current policy. This is a position in the
* array, not a policy id; the id of the current policy is
* policies[current].policy_id.
*/
uint32_t current;
/**
* List of policies.
* Only the first num_supported policies are valid.
*/
rsmi_dpm_policy_entry_t policies[RSMI_MAX_NUM_PM_POLICIES];
} rsmi_dpm_policy_t;
/// \cond Ignore in docs.
typedef rsmi_dev_perf_level_t rsmi_dev_perf_level;
/// \endcond
@@ -3295,6 +3328,42 @@ rsmi_status_t rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od);
rsmi_status_t rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind,
rsmi_clk_type_t clk_type, uint64_t freq_bitmask);
/**
* @brief Get the dpm policy for a device
*
* @details Given a device index @p dv_ind, this function will write
* current dpm policy settings to @p policy. All the devices at the same socket
* will have the same policy.
*
* @param[in] dv_ind a device index
*
* @param[in, out] policy the dpm policy for this device.
* If this parameter is nullptr, this function will return
* ::RSMI_STATUS_INVALID_ARGS
*
* @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail
*/
rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind,
rsmi_dpm_policy_t* policy);
/**
* @brief Set the dpm policy for a device
*
* @details Given a device index @p dv_ind and a dpm policy @p policy_id,
* this function will set the DPM policy for this device. All the devices at
* the same socket will be set to the same policy.
*
* @note This function requires root access
*
* @param[in] dv_ind a device index
*
* @param[in] policy_id the id of the dpm policy to set
*
* @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail
*/
rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind,
uint32_t policy_id);
/** @} */ // end of PerfCont
/*****************************************************************************/
+1
Просмотреть файл
@@ -173,6 +173,7 @@ enum DevInfoTypes {
kDevNumaNode,
kDevGpuMetrics,
kDevPmMetrics,
kDevDPMPolicy,
kDevRegMetrics,
kDevGpuReset,
kDevAvailableComputePartition,
+1
Просмотреть файл
@@ -78,6 +78,7 @@ int isRegularFile(std::string fname, bool *is_reg);
int ReadSysfsStr(std::string path, std::string *retStr);
int WriteSysfsStr(std::string path, std::string val);
bool IsInteger(const std::string & n_str);
bool stringToInteger(const std::string & n_str, int& value);
std::pair<bool, std::string> executeCommand(std::string command,
bool stdOut = true);
rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName,
+128
Просмотреть файл
@@ -145,6 +145,7 @@ static uint64_t get_multiplier_from_str(char units_char) {
return multiplier;
}
/**
* Parse a string of the form:
* "<int index>: <int freq><freq. unit string> <|*>"
@@ -2014,6 +2015,133 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind,
CATCH
}
// Set the dpm policy of device dv_ind.
//
// Builds the string "soc_pstate <policy_id>" and writes it to the device's
// kDevDPMPolicy attribute via writeDevInfo(); the int result of that write is
// converted to an rsmi_status_t with ErrnoToRsmiStatus().
rsmi_status_t
rsmi_dev_dpm_policy_set(uint32_t dv_ind,
uint32_t policy_id) {
rsmi_status_t ret;
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
// NOTE(review): 'dev' is not declared in this function; presumably it is
// introduced by GET_DEV_FROM_INDX -- confirm against the macro definition.
GET_DEV_FROM_INDX
// Command written to the attribute: "soc_pstate " followed by the decimal id.
std::string value("soc_pstate ");
value += std::to_string(policy_id);
// NOTE(review): second variable named 'ret' (int) next to the rsmi_status_t
// 'ret' above; presumably TRY opens a new scope so this one shadows the
// outer one, which is then never used -- confirm.
int ret = dev->writeDevInfo(amd::smi::kDevDPMPolicy , value);
return amd::smi::ErrnoToRsmiStatus(ret);
CATCH
}
// Read the kDevDPMPolicy attribute of device dv_ind and parse its
// "soc pstate" section into *policy.
//
// Returns:
//   RSMI_STATUS_INVALID_ARGS    - policy is nullptr.
//   RSMI_STATUS_NOT_SUPPORTED   - GetDevValueVec() reported
//                                 RSMI_STATUS_FILE_ERROR, or no "soc pstate"
//                                 header line was found.
//   RSMI_STATUS_UNEXPECTED_DATA - a negative id, more than
//                                 RSMI_MAX_NUM_PM_POLICIES entries, or no
//                                 entry marked as current ('*').
//   RSMI_STATUS_SUCCESS         - at least the current policy was parsed.
//   Any other status returned by GetDevValueVec() is passed through.
rsmi_status_t
rsmi_dev_dpm_policy_get(uint32_t dv_ind,
rsmi_dpm_policy_t* policy) {
rsmi_status_t ret;
std::vector<std::string> val_vec;
if (policy == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
}
// Zero the whole output first: every description buffer starts out filled
// with '\0', which the strncpy() below relies on for termination.
*policy = {};
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
DEVICE_MUTEX
// Fetch the attribute content as one string per line.
ret = GetDevValueVec(amd::smi::kDevDPMPolicy, dv_ind, &val_vec);
// A file error is reported to the caller as "not supported".
if (ret == RSMI_STATUS_FILE_ERROR) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR "
<< "-> reporting RSMI_STATUS_NOT_SUPPORTED";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS"
<< " -> reporting " << amd::smi::getRSMIStatusString(ret);
LOG_ERROR(ss);
return ret;
}
/*
The parser relies on the numeric id and not on the description string,
as the strings may vary from soc to soc.
The current pstate is marked with a trailing '*'.
Expected layout of the section:
soc pstate
0 : soc_pstate_default
1 : soc_pstate_0
2 : soc_pstate_1*
3 : soc_pstate_2
*/
bool see_soc_pstate = false;
bool see_current = false;
policy->num_supported = 0;
for (uint32_t i = 0; i < val_vec.size(); ++i) {
auto current_line = amd::smi::trim(val_vec[i]);
// Skip everything up to and including the "soc pstate" header line.
if (current_line == "soc pstate") {
see_soc_pstate = true;
continue;
}
if (see_soc_pstate == false) continue;
// Get tokens: <integer> : <string *>
std::vector<std::string> tokens;
std::istringstream f(current_line);
std::string s;
while (getline(f, s, ':')) {
tokens.push_back(s);
}
int value = 0;
// The first line that is not "<integer> : <text>" ends the section.
if (tokens.size() < 2 || !amd::smi::stringToInteger(tokens[0], value)) {
break;
}
if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", Unexpeced pstat data: the id is negative or too many policies.";
LOG_ERROR(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}
policy->policies[policy->num_supported].policy_id = value;
std::string description = amd::smi::trim(tokens[1]);
// NOTE(review): the '*' test looks at the whole line, while pop_back() acts
// on tokens[1] only. If a line contained more than one ':' these would
// differ (and description could even be empty) -- confirm the attribute
// never produces such lines.
if (current_line.back() == '*') {  // current policy
description.pop_back();  // remove last *
description = amd::smi::trim(description);
// 'current' stores the array index of the entry, not its policy_id.
policy->current = policy->num_supported;
see_current = true;
}
// Copies at most RSMI_MAX_POLICY_NAME-1 characters, so the last byte of the
// zero-initialized buffer is left as the terminator.
strncpy(policy->policies[policy->num_supported].policy_description,
description.c_str(),
RSMI_MAX_POLICY_NAME-1);
policy->num_supported++;
}  // end for
// No "soc pstate" header at all.
if (!see_soc_pstate) {
return RSMI_STATUS_NOT_SUPPORTED;
}
// Entries were listed but none carried the '*' marker.
if (!see_current) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", Unexpeced pstat data: cannot find the current policy.";
LOG_ERROR(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}
// Section header and current policy were both found.
return RSMI_STATUS_SUCCESS;
CATCH
}
static std::vector<std::string> pci_name_files = {
"/usr/share/misc/pci.ids",
"/usr/share/hwdata/pci.ids",
+7
Просмотреть файл
@@ -136,6 +136,7 @@ static const char *kDevAvailableComputePartitionFName =
"available_compute_partition";
static const char *kDevComputePartitionFName = "current_compute_partition";
static const char *kDevMemoryPartitionFName = "current_memory_partition";
static const char* kDevDPMPolicyFName = "pm_policy"; // The PM policy for pstat and XGMI
// Firmware version files
static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version";
@@ -315,6 +316,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevNumaNode, kDevNumaNodeFName},
{kDevGpuMetrics, kDevGpuMetricsFName},
{kDevPmMetrics, kDevPmMetricsFName},
{kDevDPMPolicy, kDevDPMPolicyFName},
{kDevRegMetrics, kDevRegMetricsFName},
{kDevGpuReset, kDevGpuResetFName},
{kDevAvailableComputePartition, kDevAvailableComputePartitionFName},
@@ -472,6 +474,7 @@ Device::devInfoTypesStrings = {
{kDevComputePartition, "kDevComputePartition"},
{kDevMemoryPartition, "kDevMemoryPartition"},
{kDevPCieVendorID, "kDevPCieVendorID"},
{kDevDPMPolicy, "kDevDPMPolicy"},
};
static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
@@ -533,6 +536,8 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
{"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}},
{"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}},
{"rsmi_dev_pm_metrics_info_get", {{kDevPmMetricsFName}, {}}},
{"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}},
{"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}},
{"rsmi_dev_reg_table_info_get", {{kDevRegMetricsFName}, {}}},
{"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}},
{"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}},
@@ -938,6 +943,7 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) {
case kDevPCIEClk:
case kDevPowerODVoltage:
case kDevSOCClk:
case kDevDPMPolicy:
return writeDevInfoStr(type, val);
case kDevComputePartition:
case kDevMemoryPartition:
@@ -1219,6 +1225,7 @@ int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
case kDevErrCntHDP:
case kDevErrCntXGMIWAFL:
case kDevMemPageBad:
case kDevDPMPolicy:
return readDevInfoMultiLineStr(type, val);
break;
+10
Просмотреть файл
@@ -257,6 +257,16 @@ bool IsInteger(const std::string & n_str) {
return (*tmp == 0);
}
/**
 * @brief Convert a string to an int.
 *
 * The input is passed through trim() and then parsed with std::stoi (base 10).
 * The conversion succeeds only if the whole trimmed string is consumed by the
 * number, apart from trailing whitespace; input such as "12abc" is rejected
 * instead of being silently parsed as 12.
 *
 * @param[in] n_str the string to convert
 * @param[out] value receives the parsed number on success; left untouched
 * on failure
 *
 * @return true if @p n_str holds a valid int, false otherwise (not a number,
 * out of range for int, or trailing non-whitespace characters)
 */
bool stringToInteger(const std::string & n_str, int& value) {
  try {
    const std::string trimmed = trim(n_str);
    std::size_t parsed_len = 0;
    const int parsed = std::stoi(trimmed, &parsed_len);
    // std::stoi stops at the first character that is not part of the number;
    // anything left other than whitespace means the input was not an integer.
    if (trimmed.find_first_not_of(" \t\r\n\f\v", parsed_len) !=
        std::string::npos) {
      return false;
    }
    value = parsed;
    return true;
  } catch (...) {
    // std::stoi throws on non-numeric or out-of-range input.
    return false;
  }
}
rsmi_status_t handleException() {
try {
throw;
+17
Просмотреть файл
@@ -1352,6 +1352,23 @@ amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle,
return rsmi_wrapper(rsmi_dev_gpu_clk_freq_set, processor_handle,
static_cast<rsmi_clk_type_t>(clk_type), freq_bitmask);
}
// Set the dpm policy of the given processor by forwarding the request to
// rsmi_dev_dpm_policy_set() through rsmi_wrapper().
amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle,
                                      uint32_t policy) {
    AMDSMI_CHECK_INIT();

    const amdsmi_status_t status =
        rsmi_wrapper(rsmi_dev_dpm_policy_set, processor_handle, policy);
    return status;
}
// Get the dpm policy information of the given processor by forwarding the
// request to rsmi_dev_dpm_policy_get() through rsmi_wrapper().
amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle,
                                      amdsmi_dpm_policy_t* policy) {
    AMDSMI_CHECK_INIT();

    // The amdsmi structure is handed to the rsmi call as its rsmi counterpart.
    rsmi_dpm_policy_t* rsmi_policy = reinterpret_cast<rsmi_dpm_policy_t*>(policy);
    const amdsmi_status_t status =
        rsmi_wrapper(rsmi_dev_dpm_policy_get, processor_handle, rsmi_policy);
    return status;
}
amdsmi_status_t
amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle,
uint32_t *num_pages,