From 46ab68f84051b67e5499bd480cc776480cd5ee60 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Thu, 22 Feb 2024 08:38:54 -0600 Subject: [PATCH] Set and get DPM policy for GPU device Add new APIs to set and get dpm policy for the GPU device. Change-Id: I26fa49cd17d0ce66bda3446c38945a6cf35717ff [ROCm/amdsmi commit: 108e6d4ae6754ef1805ebc1fd0119754e4f982f8] --- projects/amdsmi/amdsmi_cli/README.md | 51 ++++++- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 58 +++++--- projects/amdsmi/amdsmi_cli/amdsmi_helpers.py | 1 - projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 4 + .../amdsmi/example/amd_smi_nodrm_example.cc | 12 ++ projects/amdsmi/include/amd_smi/amdsmi.h | 74 +++++++++- .../amdsmi/py-interface/amdsmi_interface.py | 44 +++++- .../amdsmi/py-interface/amdsmi_wrapper.py | 68 +++++++--- .../rocm_smi/include/rocm_smi/rocm_smi.h | 69 ++++++++++ .../include/rocm_smi/rocm_smi_device.h | 1 + .../include/rocm_smi/rocm_smi_utils.h | 1 + projects/amdsmi/rocm_smi/src/rocm_smi.cc | 128 ++++++++++++++++++ .../amdsmi/rocm_smi/src/rocm_smi_device.cc | 7 + .../amdsmi/rocm_smi/src/rocm_smi_utils.cc | 10 ++ projects/amdsmi/src/amd_smi/amd_smi.cc | 17 +++ 15 files changed, 506 insertions(+), 39 deletions(-) diff --git a/projects/amdsmi/amdsmi_cli/README.md b/projects/amdsmi/amdsmi_cli/README.md index abded835a8..cf2b81df28 100644 --- a/projects/amdsmi/amdsmi_cli/README.md +++ b/projects/amdsmi/amdsmi_cli/README.md @@ -474,7 +474,7 @@ Command Modifiers: ```bash usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] 
| -O CORE [CORE ...]) [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] [-M PARTITION] - [-o WATTS] [--cpu-pwr-limit PWR_LIMIT] + [-o WATTS] [-p POLICY] [--cpu-pwr-limit PWR_LIMIT] [--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH] [--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM] [--cpu-pwr-eff-mode MODE] [--cpu-gmi3-link-width MIN_LW MAX_LW] [--cpu-pcie-link-rate LINK_RATE] @@ -512,6 +512,7 @@ Set Arguments: -M, --memory-partition PARTITION Set one of the following the memory partition modes: NPS1, NPS2, NPS4, NPS8 -o, --power-cap WATTS Set power capacity limit + -p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id CPU Arguments: --cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value. @@ -674,6 +675,18 @@ GPU: 0 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 0 AFFINITY: 0 @@ -770,6 +783,18 @@ GPU: 1 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 1 AFFINITY: 1 @@ -866,6 +891,18 @@ GPU: 2 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 2 AFFINITY: 2 @@ -962,6 +999,18 @@ GPU: 3 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + 
POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 3 AFFINITY: 3 diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index a919b47218..697513f5ba 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -244,7 +244,7 @@ class AMDSMICommands(): def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, - cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None): + cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, policy=None): """Get Static information for target gpu Args: @@ -267,7 +267,7 @@ class AMDSMICommands(): dfc_ucode (bool, optional): Value override for args.dfc_ucode. Defaults to None. fb_info (bool, optional): Value override for args.fb_info. Defaults to None. num_vf (bool, optional): Value override for args.num_vf. Defaults to None. - + policy (bool, optional): Value override for args.policy. Defaults to None. 
Returns: None: Print output via AMDSMILogger to destination """ @@ -300,8 +300,10 @@ class AMDSMICommands(): args.partition = partition if limit: args.limit = limit - current_platform_args += ["ras", "limit", "partition"] - current_platform_values += [args.ras, args.limit, args.partition] + if policy: + args.policy = policy + current_platform_args += ["ras", "limit", "partition", "policy"] + current_platform_values += [args.ras, args.limit, args.partition, args.policy] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -486,6 +488,7 @@ class AMDSMICommands(): shutdown_temp_vram_limit = "N/A" logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + # Assign units power_unit = 'W' temp_unit_human_readable = '\N{DEGREE SIGN}C' @@ -626,6 +629,15 @@ class AMDSMICommands(): static_dict['partition'] = {"compute_partition": compute_partition, "memory_partition": memory_partition} + if 'policy' in current_platform_args: + if args.policy: + try: + policy_info = amdsmi_interface.amdsmi_get_dpm_policy(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + policy_info = "N/A" + logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['dpm_policy'] = policy_info if 'numa' in current_platform_args: if args.numa: try: @@ -762,7 +774,7 @@ class AMDSMICommands(): bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, cpu=None, - interface_ver=None): + interface_ver=None, policy=None): """Get Static information for target gpu and cpu Args: @@ -785,7 +797,7 @@ class AMDSMICommands(): num_vf (bool, optional): Value override for args.num_vf. Defaults to None. cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None. interface_ver (bool, optional): Value override for args.interface_ver. 
Defaults to None - + policy (bool, optional): Value override for args.policy. Defaults to None. Raises: IndexError: Index error if gpu list is empty @@ -811,7 +823,7 @@ class AMDSMICommands(): gpu_args_enabled = False gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras", "board", "numa", "vram", "cache", "partition", - "dfc_ucode", "fb_info", "num_vf"] + "dfc_ucode", "fb_info", "num_vf", "policy"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr): @@ -841,7 +853,7 @@ class AMDSMICommands(): self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf) + dfc_ucode, fb_info, num_vf, policy) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None: args.cpu = self.cpu_handles @@ -855,7 +867,7 @@ class AMDSMICommands(): self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf) + dfc_ucode, fb_info, num_vf, policy) def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -3096,7 +3108,7 @@ class AMDSMICommands(): def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, - memory_partition=None, power_cap=None): + memory_partition=None, power_cap=None, dpm_policy=None): """Issue reset commands to target gpu(s) Args: @@ -3110,6 +3122,7 @@ class AMDSMICommands(): compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None. memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. power_cap (int, optional): Value override for args.power_cap. Defaults to None. + dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. 
Raises: ValueError: Value error if no gpu value is provided @@ -3135,7 +3148,8 @@ class AMDSMICommands(): args.memory_partition = memory_partition if power_cap: args.power_cap = power_cap - + if dpm_policy: + args.dpm_policy = dpm_policy # Handle No GPU passed if args.gpu == None: raise ValueError('No GPU provided, specific GPU target(s) are needed') @@ -3154,7 +3168,8 @@ class AMDSMICommands(): args.compute_partition, args.memory_partition, args.perf_determinism is not None, - args.power_cap]): + args.power_cap, + args.dpm_policy]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3218,6 +3233,16 @@ class AMDSMICommands(): raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}") + + if args.dpm_policy: + try: + amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") + if isinstance(args.power_cap, int): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) @@ -3257,7 +3282,7 @@ class AMDSMICommands(): cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, - soc_boost_limit=None, core=None, core_boost_limit=None): + soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None): """Issue reset 
commands to target gpu(s) Args: @@ -3286,6 +3311,7 @@ class AMDSMICommands(): core (device_handle, optional): device_handle for target core. Defaults to None. core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None + dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -3306,7 +3332,7 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap"] + "memory_partition", "power_cap", "dpm_policy"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3367,7 +3393,7 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap) + memory_partition, power_cap, dpm_policy) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -3386,7 +3412,7 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap) + memory_partition, power_cap, dpm_policy) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index a685a7e8dd..080cc3538e 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -632,7 +632,6 @@ class AMDSMIHelpers(): compute_partitions_str.remove('INVALID') return compute_partitions_str - def get_memory_partition_types(self): memory_partitions_str = 
[partition.name for partition in amdsmi_interface.AmdSmiMemoryPartitionType] if 'UNKNOWN' in memory_partitions_str: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index ab283d9c13..5341b27486 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -543,6 +543,7 @@ class AMDSMIParser(argparse.ArgumentParser): vram_help = "All vram information" cache_help = "All cache information" board_help = "All board information" + dpm_policy_help = "The available DPM policy" # Options arguments help text for Hypervisors and Baremetal ras_help = "Displays RAS features information" @@ -582,6 +583,7 @@ class AMDSMIParser(argparse.ArgumentParser): static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) + static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -963,6 +965,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}" set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" set_power_cap_help = "Set power capacity limit" + set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." 
@@ -998,6 +1001,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION') set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') + set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID') if self.helpers.is_amd_hsmp_initialized(): # Optional CPU Args diff --git a/projects/amdsmi/example/amd_smi_nodrm_example.cc b/projects/amdsmi/example/amd_smi_nodrm_example.cc index 0f82937569..e6d37cedab 100644 --- a/projects/amdsmi/example/amd_smi_nodrm_example.cc +++ b/projects/amdsmi/example/amd_smi_nodrm_example.cc @@ -331,6 +331,18 @@ int main() { printf(" Output of amdsmi_get_power_cap_info:\n"); std::cout << "\t\t Power Cap: " << cap_info.power_cap / 1000000 << "W\n\n"; + + amdsmi_dpm_policy_t policy; + ret = amdsmi_get_dpm_policy(processor_handles[j], &policy); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + std::cout << "\t amdsmi_get_dpm_policy total:" << policy.num_supported + <<" current:" << policy.current << "\n"; + for (int x=0; x < policy.num_supported; x++) { + std::cout << x <<": (" << policy.policies[x].policy_id + <<"," << policy.policies[x].policy_description << ")\n"; + } + } } } diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 7e283bfb75..ef58a6ce3d 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -151,7 +151,7 @@ typedef enum { 
#define AMDSMI_LIB_VERSION_YEAR 24 //! Major version should be changed for every header change (adding/deleting APIs, changing names, fields of structures, etc.) -#define AMDSMI_LIB_VERSION_MAJOR 4 +#define AMDSMI_LIB_VERSION_MAJOR 5 //! Minor version should be updated for each API change, but without changing headers #define AMDSMI_LIB_VERSION_MINOR 0 @@ -1151,6 +1151,37 @@ typedef struct { uint64_t frequency[AMDSMI_MAX_NUM_FREQUENCIES]; } amdsmi_frequencies_t; +/** + * @brief The dpm policy. + */ +typedef struct { + uint32_t policy_id; + char policy_description[AMDSMI_MAX_NAME]; +} amdsmi_dpm_policy_entry_t; + +#define AMDSMI_MAX_NUM_PM_POLICIES 32 + +/** + * @brief This structure holds information about dpm policies. + */ +typedef struct { + /** + * The number of supported policies + */ + uint32_t num_supported; + + /** + * The current policy index + */ + uint32_t current; + + /** + * List of policies. + * Only the first num_supported policies are valid. + */ + amdsmi_dpm_policy_entry_t policies[AMDSMI_MAX_NUM_PM_POLICIES]; +} amdsmi_dpm_policy_t; + /** * @brief This structure holds information about the possible PCIe * bandwidths. Specifically, the possible transfer rates and their @@ -3333,6 +3364,47 @@ amdsmi_status_t amdsmi_set_gpu_overdrive_level(amdsmi_processor_handle processor amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_clk_type_t clk_type, uint64_t freq_bitmask); +/** + * @brief Get the dpm policy for the processor + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle, this function will write + * current dpm policy settings to @p policy. All the processors at the same socket + * will have the same policy. + * + * @param[in] processor_handle a processor handle + * + * @param[in, out] policy the dpm policy for this processor. 
+ * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* policy); + +/** + * @brief Set the dpm policy for the processor + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle and a dpm policy @p policy_id, + * this function will set the dpm policy for this processor. All the processors at + * the same socket will be set to the same policy. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] policy_id the dpm policy id to set. The id is the id in + * amdsmi_dpm_policy_entry_t, which can be obtained by calling + * amdsmi_get_dpm_policy() + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle, + uint32_t policy_id); /** @} End PerfCont */ /*****************************************************************************/ diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 74a190da06..7eb501bb64 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -360,7 +360,6 @@ class AmdSmiProcessorType(IntEnum): NON_AMD_GPU = amdsmi_wrapper.NON_AMD_GPU NON_AMD_CPU = amdsmi_wrapper.NON_AMD_CPU - class AmdSmiEventReader: def __init__( self, processor_handle: amdsmi_wrapper.amdsmi_processor_handle, @@ -2690,6 +2689,19 @@ def amdsmi_set_clk_freq( ) ) +def amdsmi_set_dpm_policy( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + policy_id: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, 
amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + amdsmi_wrapper.amdsmi_set_dpm_policy( + processor_handle, policy_id + ) + ) def amdsmi_set_gpu_overdrive_level( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int @@ -3249,6 +3261,36 @@ def amdsmi_get_clk_freq( "frequency": list(freq.frequency)[: freq.num_supported - 1], } +def amdsmi_get_dpm_policy( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + policy = amdsmi_wrapper.amdsmi_dpm_policy_t() + _check_res( + amdsmi_wrapper.amdsmi_get_dpm_policy( + processor_handle, ctypes.byref(policy) + ) + ) + + polices = [] + for i in range(0, policy.num_supported): + id = policy.policies[i].policy_id + desc = policy.policies[i].policy_description + polices.append({ + 'policy_id' : id, + 'policy_description': desc.decode() + }) + current_id = policy.policies[policy.current].policy_id + + return { + "num_supported": policy.num_supported, + "current_id": current_id, + "policies": polices, + } def amdsmi_get_gpu_od_volt_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 91bdc8bd4d..8fcdb375b1 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -746,6 +746,19 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + 
('reserved', ctypes.c_uint64 * 10), +] + class struct_pcie_metric_(Structure): pass @@ -764,19 +777,6 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 13), ] -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -1480,6 +1480,27 @@ struct_amdsmi_frequencies_t._fields_ = [ ] amdsmi_frequencies_t = struct_amdsmi_frequencies_t +class struct_amdsmi_dpm_policy_entry_t(Structure): + pass + +struct_amdsmi_dpm_policy_entry_t._pack_ = 1 # source:False +struct_amdsmi_dpm_policy_entry_t._fields_ = [ + ('policy_id', ctypes.c_uint32), + ('policy_description', ctypes.c_char * 32), +] + +amdsmi_dpm_policy_entry_t = struct_amdsmi_dpm_policy_entry_t +class struct_amdsmi_dpm_policy_t(Structure): + pass + +struct_amdsmi_dpm_policy_t._pack_ = 1 # source:False +struct_amdsmi_dpm_policy_t._fields_ = [ + ('num_supported', ctypes.c_uint32), + ('current', ctypes.c_uint32), + ('policies', struct_amdsmi_dpm_policy_entry_t * 32), +] + +amdsmi_dpm_policy_t = struct_amdsmi_dpm_policy_t class struct_amdsmi_pcie_bandwidth_t(Structure): pass @@ -2030,6 +2051,12 @@ amdsmi_set_gpu_overdrive_level.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_set_clk_freq = _libraries['libamd_smi.so'].amdsmi_set_clk_freq amdsmi_set_clk_freq.restype = amdsmi_status_t amdsmi_set_clk_freq.argtypes = [amdsmi_processor_handle, amdsmi_clk_type_t, uint64_t] +amdsmi_get_dpm_policy = _libraries['libamd_smi.so'].amdsmi_get_dpm_policy +amdsmi_get_dpm_policy.restype = amdsmi_status_t +amdsmi_get_dpm_policy.argtypes = [amdsmi_processor_handle, 
ctypes.POINTER(struct_amdsmi_dpm_policy_t)] +amdsmi_set_dpm_policy = _libraries['libamd_smi.so'].amdsmi_set_dpm_policy +amdsmi_set_dpm_policy.restype = amdsmi_status_t +amdsmi_set_dpm_policy.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version amdsmi_get_lib_version.restype = amdsmi_status_t amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)] @@ -2486,7 +2513,8 @@ __all__ = \ 'amdsmi_cpu_apb_enable', 'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t', 'amdsmi_dev_perf_level_t', 'amdsmi_dimm_power_t', 'amdsmi_dimm_thermal_t', - 'amdsmi_dpm_level_t', 'amdsmi_driver_info_t', + 'amdsmi_dpm_level_t', 'amdsmi_dpm_policy_entry_t', + 'amdsmi_dpm_policy_t', 'amdsmi_driver_info_t', 'amdsmi_engine_usage_t', 'amdsmi_error_count_t', 'amdsmi_event_group_t', 'amdsmi_event_handle_t', 'amdsmi_event_type_t', 'amdsmi_evt_notification_data_t', @@ -2516,10 +2544,10 @@ __all__ = \ 'amdsmi_get_cpu_socket_power', 'amdsmi_get_cpu_socket_power_cap', 'amdsmi_get_cpu_socket_power_cap_max', 'amdsmi_get_cpu_socket_temperature', 'amdsmi_get_cpucore_handles', - 'amdsmi_get_cpusocket_handles', 'amdsmi_get_energy_count', - 'amdsmi_get_esmi_err_msg', 'amdsmi_get_fw_info', - 'amdsmi_get_gpu_activity', 'amdsmi_get_gpu_asic_info', - 'amdsmi_get_gpu_available_counters', + 'amdsmi_get_cpusocket_handles', 'amdsmi_get_dpm_policy', + 'amdsmi_get_energy_count', 'amdsmi_get_esmi_err_msg', + 'amdsmi_get_fw_info', 'amdsmi_get_gpu_activity', + 'amdsmi_get_gpu_asic_info', 'amdsmi_get_gpu_available_counters', 'amdsmi_get_gpu_bad_page_info', 'amdsmi_get_gpu_bdf_id', 'amdsmi_get_gpu_board_info', 'amdsmi_get_gpu_cache_info', 'amdsmi_get_gpu_compute_partition', @@ -2599,7 +2627,8 @@ __all__ = \ 'amdsmi_set_cpu_socket_boostlimit', 'amdsmi_set_cpu_socket_lclk_dpm_level', 'amdsmi_set_cpu_socket_power_cap', 'amdsmi_set_cpu_xgmi_width', - 'amdsmi_set_gpu_clk_range', 'amdsmi_set_gpu_compute_partition', + 'amdsmi_set_dpm_policy', 
'amdsmi_set_gpu_clk_range', + 'amdsmi_set_gpu_compute_partition', 'amdsmi_set_gpu_event_notification_mask', 'amdsmi_set_gpu_fan_speed', 'amdsmi_set_gpu_memory_partition', 'amdsmi_set_gpu_od_clk_info', 'amdsmi_set_gpu_od_volt_info', @@ -2625,6 +2654,7 @@ __all__ = \ 'struct_amdsmi_clk_info_t', 'struct_amdsmi_counter_value_t', 'struct_amdsmi_ddr_bw_metrics_t', 'struct_amdsmi_dimm_power_t', 'struct_amdsmi_dimm_thermal_t', 'struct_amdsmi_dpm_level_t', + 'struct_amdsmi_dpm_policy_entry_t', 'struct_amdsmi_dpm_policy_t', 'struct_amdsmi_driver_info_t', 'struct_amdsmi_engine_usage_t', 'struct_amdsmi_error_count_t', 'struct_amdsmi_evt_notification_data_t', diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index d42ac466c0..1265421355 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -192,6 +192,39 @@ typedef enum { RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 //!< Unknown performance level } rsmi_dev_perf_level_t; + + +#define RSMI_MAX_NUM_PM_POLICIES 32 +#define RSMI_MAX_POLICY_NAME 32 +/** + * @brief The dpm policy. + */ +typedef struct { + uint32_t policy_id; + char policy_description[RSMI_MAX_POLICY_NAME]; +} rsmi_dpm_policy_entry_t; + +/** + * @brief This structure holds information about dpm policies. + */ +typedef struct { + /** + * The number of supported policies + */ + uint32_t num_supported; + + /** + * The current policy index + */ + uint32_t current; + + /** + * List of policies. + * Only the first num_supported policies are valid. + */ + rsmi_dpm_policy_entry_t policies[RSMI_MAX_NUM_PM_POLICIES]; +} rsmi_dpm_policy_t; + /// \cond Ignore in docs. 
typedef rsmi_dev_perf_level_t rsmi_dev_perf_level; /// \endcond @@ -3295,6 +3328,42 @@ rsmi_status_t rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od); rsmi_status_t rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, rsmi_clk_type_t clk_type, uint64_t freq_bitmask); +/** + * @brief Get the dpm policy for a device + * + * @details Given a device index @p dv_ind, this function will write + * current dpm policy settings to @p policy. All the devices at the same socket + * will have the same policy. + * + * @param[in] dv_ind a device index + * + * @param[in, out] policy the dpm policy for this device. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVAL + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, + rsmi_dpm_policy_t* policy); + +/** + * @brief Set the dpm policy for a device + * + * @details Given a device index @p dv_ind and a dpm policy @p policy_id, + * this function will set the DPM policy for this device. All the devices at + * the same socket will be set to the same policy. 
+ * + * @note This function requires root access + * + * @param[in] dv_ind a device index + * + * @param[in] policy_id the id of the dpm policy to set + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind, + uint32_t policy_id); + /** @} */ // end of PerfCont /*****************************************************************************/ diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h index 5ca5193b38..3df15f2e51 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -173,6 +173,7 @@ enum DevInfoTypes { kDevNumaNode, kDevGpuMetrics, kDevPmMetrics, + kDevDPMPolicy, kDevRegMetrics, kDevGpuReset, kDevAvailableComputePartition, diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h index a6c3e80c31..67d9d8b8d8 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -78,6 +78,7 @@ int isRegularFile(std::string fname, bool *is_reg); int ReadSysfsStr(std::string path, std::string *retStr); int WriteSysfsStr(std::string path, std::string val); bool IsInteger(const std::string & n_str); +bool stringToInteger(const std::string & n_str, int& value); std::pair executeCommand(std::string command, bool stdOut = true); rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index db5dbcc768..91c8ddbb69 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -145,6 +145,7 @@ static uint64_t get_multiplier_from_str(char units_char) { return multiplier; } + /** * Parse a string of the 
form: * ": <|*>" @@ -2014,6 +2015,133 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, CATCH } + + +rsmi_status_t +rsmi_dev_dpm_policy_set(uint32_t dv_ind, + uint32_t policy_id) { + rsmi_status_t ret; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + std::string value("soc_pstate "); + value += std::to_string(policy_id); + int ret = dev->writeDevInfo(amd::smi::kDevDPMPolicy , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + +rsmi_status_t +rsmi_dev_dpm_policy_get(uint32_t dv_ind, + rsmi_dpm_policy_t* policy) { + rsmi_status_t ret; + std::vector val_vec; + + if (policy == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + *policy = {}; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + DEVICE_MUTEX + + ret = GetDevValueVec(amd::smi::kDevDPMPolicy, dv_ind, &val_vec); + if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); + return ret; + } + /* + The parser relies on the policy number rather than the description string, as the string may vary from SoC to SoC. 
+ The current pstate is marked with * + soc pstate + 0 : soc_pstate_default + 1 : soc_pstate_0 + 2 : soc_pstate_1* + 3 : soc_pstate_2 + */ + bool see_soc_pstate = false; + bool see_current = false; + policy->num_supported = 0; + for (uint32_t i = 0; i < val_vec.size(); ++i) { + auto current_line = amd::smi::trim(val_vec[i]); + if (current_line == "soc pstate") { + see_soc_pstate = true; + continue; + } + if (see_soc_pstate == false) continue; + + // Get tokens: id : description + std::vector tokens; + std::istringstream f(current_line); + std::string s; + while (getline(f, s, ':')) { + tokens.push_back(s); + } + + int value = 0; + // Stop at the first line that is not an id : description entry + if (tokens.size() < 2 || !amd::smi::stringToInteger(tokens[0], value)) { + break; + } + + if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpeced pstat data: the id is negative or too many policies."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + + policy->policies[policy->num_supported].policy_id = value; + std::string description = amd::smi::trim(tokens[1]); + if (current_line.back() == '*') { // current policy + description.pop_back(); // remove last * + description = amd::smi::trim(description); + policy->current = policy->num_supported; + see_current = true; + } + strncpy(policy->policies[policy->num_supported].policy_description, + description.c_str(), + RSMI_MAX_POLICY_NAME-1); + policy->num_supported++; + } // end for + + if (!see_soc_pstate) { + return RSMI_STATUS_NOT_SUPPORTED; + } + + if (!see_current) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpeced pstat data: cannot find the current policy."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + // Every policy was parsed and the current one was found + return RSMI_STATUS_SUCCESS; + + CATCH +} + static std::vector pci_name_files = { "/usr/share/misc/pci.ids", "/usr/share/hwdata/pci.ids", diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc
b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index 305364b6e2..3e63659c82 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -136,6 +136,7 @@ static const char *kDevAvailableComputePartitionFName = "available_compute_partition"; static const char *kDevComputePartitionFName = "current_compute_partition"; static const char *kDevMemoryPartitionFName = "current_memory_partition"; +static const char* kDevDPMPolicyFName = "pm_policy"; // The PM policy for pstat and XGMI // Firmware version files static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version"; @@ -315,6 +316,7 @@ static const std::map kDevAttribNameMap = { {kDevNumaNode, kDevNumaNodeFName}, {kDevGpuMetrics, kDevGpuMetricsFName}, {kDevPmMetrics, kDevPmMetricsFName}, + {kDevDPMPolicy, kDevDPMPolicyFName}, {kDevRegMetrics, kDevRegMetricsFName}, {kDevGpuReset, kDevGpuResetFName}, {kDevAvailableComputePartition, kDevAvailableComputePartitionFName}, @@ -472,6 +474,7 @@ Device::devInfoTypesStrings = { {kDevComputePartition, "kDevComputePartition"}, {kDevMemoryPartition, "kDevMemoryPartition"}, {kDevPCieVendorID, "kDevPCieVendorID"}, + {kDevDPMPolicy, "kDevDPMPolicy"}, }; static const std::map kDevFuncDependsMap = { @@ -533,6 +536,8 @@ static const std::map kDevFuncDependsMap = { {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, {"rsmi_dev_pm_metrics_info_get", {{kDevPmMetricsFName}, {}}}, + {"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}}, {"rsmi_dev_reg_table_info_get", {{kDevRegMetricsFName}, {}}}, {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}}, {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}}, @@ -938,6 +943,7 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) { case kDevPCIEClk: case kDevPowerODVoltage: case kDevSOCClk: + case kDevDPMPolicy: return 
writeDevInfoStr(type, val); case kDevComputePartition: case kDevMemoryPartition: @@ -1219,6 +1225,7 @@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { case kDevErrCntHDP: case kDevErrCntXGMIWAFL: case kDevMemPageBad: + case kDevDPMPolicy: return readDevInfoMultiLineStr(type, val); break; diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc index ffa4ef703a..61ec4243dc 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc @@ -257,6 +257,16 @@ bool IsInteger(const std::string & n_str) { return (*tmp == 0); } +bool stringToInteger(const std::string & n_str, int& value) { + try { + value = std::stoi(trim(n_str), nullptr); + return true; + } catch (...) { + return false; + } + return false; +} + rsmi_status_t handleException() { try { throw; diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index dc5f9509b1..392b6188ca 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1352,6 +1352,23 @@ amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, return rsmi_wrapper(rsmi_dev_gpu_clk_freq_set, processor_handle, static_cast(clk_type), freq_bitmask); } + +amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle, + uint32_t policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_dpm_policy_set, processor_handle, + policy); +} + +amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_dpm_policy_get, processor_handle, + reinterpret_cast(policy)); +} + amdsmi_status_t amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages,