From b9b958b82c97864fc0bd07873c77b0fe0aa4b8cd Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 20 Mar 2024 12:06:24 -0500 Subject: [PATCH] Get and set the XGMI PLPD Update the API and CLI to support XGMI Per-Link Power Down Policy. Change-Id: Iaf04a771eb8bb0829a5b3088d803a7355a8dfd0b [ROCm/amdsmi commit: e4085c641431fe9c12ec585a91e39a57db54ecd8] --- projects/amdsmi/amdsmi_cli/README.md | 67 +++++++-- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 52 +++++-- projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 4 + projects/amdsmi/include/amd_smi/amdsmi.h | 43 ++++++ projects/amdsmi/py-interface/README.md | 72 +++++++++- .../amdsmi/py-interface/amdsmi_interface.py | 45 ++++++ .../amdsmi/py-interface/amdsmi_wrapper.py | 45 +++--- .../rocm_smi/include/rocm_smi/rocm_smi.h | 39 ++++++ projects/amdsmi/rocm_smi/src/rocm_smi.cc | 128 +++++++++++++++++- .../amdsmi/rocm_smi/src/rocm_smi_device.cc | 6 +- projects/amdsmi/src/amd_smi/amd_smi.cc | 16 +++ 11 files changed, 467 insertions(+), 50 deletions(-) diff --git a/projects/amdsmi/amdsmi_cli/README.md b/projects/amdsmi/amdsmi_cli/README.md index 3273f8077f..f9c0c06766 100644 --- a/projects/amdsmi/amdsmi_cli/README.md +++ b/projects/amdsmi/amdsmi_cli/README.md @@ -280,7 +280,7 @@ usage: amd-smi metric [-h] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE [--core-curr-active-freq-core-limit] [--core-energy] [--json | --csv] [--file FILE] [--loglevel LEVEL] -If no GPU is specified, returns metric information for all GPUs on the system. +If no GPU is specified, returns metric information for all GPUs on the system. If no metric argument is provided all metric information will be displayed. Metric arguments: @@ -325,16 +325,16 @@ CPU Arguments: --cpu-c0-res Displays C0 residency --cpu-lclk-dpm-level NBIOID Displays lclk dpm level range. 
Requires socket ID and NBOID as inputs --cpu-pwr-svi-telemtry-rails Displays svi based telemetry for all rails - --cpu-io-bandwidth IO_BW LINKID_NAME Displays current IO bandwidth for the selected CPU. - input parameters are bandwidth type(1) and link ID encodings + --cpu-io-bandwidth IO_BW LINKID_NAME Displays current IO bandwidth for the selected CPU. + input parameters are bandwidth type(1) and link ID encodings i.e. P2, P3, G0 - G7 - --cpu-xgmi-bandwidth XGMI_BW LINKID_NAME Displays current XGMI bandwidth for the selected CPU - input parameters are bandwidth type(1,2,4) and link ID encodings + --cpu-xgmi-bandwidth XGMI_BW LINKID_NAME Displays current XGMI bandwidth for the selected CPU + input parameters are bandwidth type(1,2,4) and link ID encodings i.e. P2, P3, G0 - G7 --cpu-metrics-ver Displays metrics table version --cpu-metrics-table Displays metric table --cpu-socket-energy Displays socket energy for the selected CPU socket - --cpu-ddr-bandwidth Displays per socket max ddr bw, current utilized bw, + --cpu-ddr-bandwidth Displays per socket max ddr bw, current utilized bw, and current utilized ddr bw in percentage --cpu-temp Displays cpu socket temperature --cpu-dimm-temp-range-rate DIMM_ADDR Displays dimm temperature range and refresh rate @@ -437,7 +437,7 @@ usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [-a] [-w] [-o] [-t] [-b] -If no GPU is specified, returns information for all GPUs on the system. +If no GPU is specified, returns information for all GPUs on the system. If no topology argument is provided all topology information will be displayed. Topology arguments: @@ -483,7 +483,7 @@ usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ... [--core-boost-limit BOOST_LIMIT] [--json | --csv] [--file FILE] [--loglevel LEVEL] -A GPU must be specified to set a configuration. +A GPU must be specified to set a configuration. 
A set argument must be provided; Multiple set arguments are accepted Set Arguments: @@ -513,11 +513,12 @@ Set Arguments: NPS1, NPS2, NPS4, NPS8 -o, --power-cap WATTS Set power capacity limit -p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id + -x, --xgmi-plpd POLICY_ID Set the GPU XGMI per-link power down policy using policy id CPU Arguments: --cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value. --cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH Set max and Min linkwidth. Input parameters are min and max link width values - --cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO. + --cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO. Input parameters are die_index, min dpm, max dpm. --cpu-pwr-eff-mode MODE Sets the power efficency mode policy. Input parameter is mode. --cpu-gmi3-link-width MIN_LW MAX_LW Sets max and min gmi3 link width range @@ -675,7 +676,7 @@ GPU: 0 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -687,6 +688,16 @@ GPU: 0 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 0 AFFINITY: 0 @@ -783,7 +794,7 @@ GPU: 1 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -795,6 +806,16 @@ GPU: 1 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 1 AFFINITY: 1 @@ -891,7 +912,7 @@ GPU: 2 
PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -903,6 +924,16 @@ GPU: 2 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 2 AFFINITY: 2 @@ -999,7 +1030,7 @@ GPU: 3 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -1011,6 +1042,16 @@ GPU: 3 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 3 AFFINITY: 3 diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index dbdc16acb3..689b3fa55f 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -244,7 +244,8 @@ class AMDSMICommands(): def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, - cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, policy=None): + cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, + policy=None, xgmi_plpd=None): """Get Static information for target gpu Args: @@ -268,6 +269,7 @@ class AMDSMICommands(): fb_info (bool, optional): Value override for args.fb_info. Defaults to None. num_vf (bool, optional): Value override for args.num_vf. Defaults to None. policy (bool, optional): Value override for args.policy. Defaults to None. + xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. 
Defaults to None. Returns: None: Print output via AMDSMILogger to destination """ @@ -302,8 +304,10 @@ class AMDSMICommands(): args.limit = limit if policy: args.policy = policy - current_platform_args += ["ras", "limit", "partition", "policy"] - current_platform_values += [args.ras, args.limit, args.partition, args.policy] + if xgmi_plpd: + args.xgmi_plpd = xgmi_plpd + current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd"] + current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -630,6 +634,15 @@ class AMDSMICommands(): logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['dpm_policy'] = policy_info + if 'xgmi_plpd' in current_platform_args: + if args.xgmi_plpd: + try: + policy_info = amdsmi_interface.amdsmi_get_xgmi_plpd(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + policy_info = "N/A" + logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['xgmi_plpd'] = policy_info if 'numa' in current_platform_args: if args.numa: try: @@ -766,7 +779,7 @@ class AMDSMICommands(): bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, cpu=None, - interface_ver=None, policy=None): + interface_ver=None, policy=None, xgmi_plpd = None): """Get Static information for target gpu and cpu Args: @@ -790,6 +803,7 @@ class AMDSMICommands(): cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None. interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None policy (bool, optional): Value override for args.policy. Defaults to None. + xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. 
Raises: IndexError: Index error if gpu list is empty @@ -815,7 +829,7 @@ class AMDSMICommands(): gpu_args_enabled = False gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras", "board", "numa", "vram", "cache", "partition", - "dfc_ucode", "fb_info", "num_vf", "policy"] + "dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr): @@ -859,7 +873,7 @@ class AMDSMICommands(): self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf, policy) + dfc_ucode, fb_info, num_vf, policy, xgmi_plpd) def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -3090,7 +3104,7 @@ class AMDSMICommands(): def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, - memory_partition=None, power_cap=None, dpm_policy=None): + memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None): """Issue reset commands to target gpu(s) Args: @@ -3105,6 +3119,7 @@ class AMDSMICommands(): memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. power_cap (int, optional): Value override for args.power_cap. Defaults to None. dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. + xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. 
Raises: ValueError: Value error if no gpu value is provided @@ -3132,6 +3147,8 @@ class AMDSMICommands(): args.power_cap = power_cap if dpm_policy: args.dpm_policy = dpm_policy + if xgmi_plpd is not None: + args.xgmi_plpd = xgmi_plpd # Handle No GPU passed if args.gpu == None: raise ValueError('No GPU provided, specific GPU target(s) are needed') @@ -3151,7 +3168,8 @@ class AMDSMICommands(): args.memory_partition, args.perf_determinism is not None, args.power_cap, - args.dpm_policy]): + args.dpm_policy, + args.xgmi_plpd is not None]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3225,6 +3243,15 @@ class AMDSMICommands(): raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") + if args.xgmi_plpd is not None: + try: + amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.xgmi_plpd}") + if isinstance(args.power_cap, int): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) @@ -3264,7 +3291,7 @@ class AMDSMICommands(): cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, - soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None): + soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None): """Issue reset commands to target gpu(s) Args: @@ -3294,6 +3321,7 @@ class
AMDSMICommands(): core (device_handle, optional): device_handle for target core. Defaults to None. core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. + xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -3314,7 +3342,7 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap", "dpm_policy"] + "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3370,7 +3398,7 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy) + memory_partition, power_cap, dpm_policy, xgmi_plpd) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -3389,7 +3417,7 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy) + memory_partition, power_cap, dpm_policy, xgmi_plpd) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 5341b27486..adaa91c34e 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -544,6 +544,7 @@ class AMDSMIParser(argparse.ArgumentParser): cache_help = "All cache information" 
board_help = "All board information" dpm_policy_help = "The available DPM policy" + xgmi_plpd_help = "The available XGMI per-link power down policy" # Options arguments help text for Hypervisors and Baremetal ras_help = "Displays RAS features information" @@ -584,6 +585,7 @@ class AMDSMIParser(argparse.ArgumentParser): static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help) + static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -966,6 +968,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" set_power_cap_help = "Set power capacity limit" set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n" + set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id\n" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." 
@@ -1002,6 +1005,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID') + set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') if self.helpers.is_amd_hsmp_initialized(): # Optional CPU Args diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 861709b98d..64bdb1253a 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -3405,6 +3405,49 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, */ amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle, uint32_t policy_id); + +/** + * @brief Get the xgmi per-link power down policy parameter for the processor + * + * @platform{gpu_bm_linux} + * + * @details Given a processor handle @p processor_handle, this function will write + * current xgmi plpd settings to @p policy. All the processors at the same socket + * will have the same policy. + * + * @param[in] processor_handle a processor handle + * + * @param[in, out] policy the xgmi plpd for this processor. 
+ * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* xgmi_plpd); + +/** + * @brief Set the xgmi per-link power down policy parameter for the processor + * + * @platform{gpu_bm_linux} + * + * @details Given a processor handle @p processor_handle and an xgmi plpd id @p plpd_id, + * this function will set the xgmi plpd for this processor. All the processors at + * the same socket will be set to the same policy. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] plpd_id the xgmi plpd id to set. The id is the id in + * amdsmi_dpm_policy_entry_t, which can be obtained by calling + * amdsmi_get_xgmi_plpd() + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle, + uint32_t plpd_id); + /** @} End PerfCont */ /*****************************************************************************/ diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index 7d9fd5908e..82f8ca974a 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -909,8 +909,8 @@ Field | Description `name` | Name of process `pid` | Process ID `mem` | Process memory usage -`engine_usage`|
Subfield Description
`gfx`GFX engine usage in ns
`enc`Encode engine usage in ns
-`memory_usage`|
Subfield Description
`gtt_mem`GTT memory usage
`cpu_mem`CPU memory usage
`vram_mem`VRAM memory usage
+`engine_usage` |
Subfield Description
`gfx`GFX engine usage in ns
`enc`Encode engine usage in ns
+`memory_usage` |
Subfield Description
`gtt_mem`GTT memory usage
`cpu_mem`CPU memory usage
`vram_mem`VRAM memory usage
Exceptions that can be thrown by `amdsmi_get_gpu_process_info` function: @@ -2612,6 +2612,74 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_set_xgmi_plpd + +Description: Set the xgmi per-link power down policy parameter for the processor + +Input parameters: + +* `processor_handle` handle for the given device +* `policy_id` the xgmi plpd id to set. + +Output: None + +Exceptions that can be thrown by `amdsmi_set_xgmi_plpd` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_set_xgmi_plpd(device, 0) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_get_xgmi_plpd + +Description: Get the xgmi per-link power down policy parameter for the processor + +Input parameters: + +* `processor_handle` handle for the given device + +Output: Dict containing information about xgmi per-link power down policy + +Field | Description +---|--- +`num_supported` | The number of supported policies +`current_id` | The current policy index +`plpds` | List of policies. 
+ +Exceptions that can be thrown by `amdsmi_get_xgmi_plpd` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + xgmi_plpd = amdsmi_get_xgmi_plpd(device) + print(xgmi_plpd) +except AmdSmiException as e: + print(e) +``` + ### amdsmi_set_gpu_overdrive_level Description: **deprecated** Set the overdrive percent associated with the diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index e27451dab4..c9e773b88f 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -2746,6 +2746,20 @@ def amdsmi_set_dpm_policy( ) ) +def amdsmi_set_xgmi_plpd( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + policy_id: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + amdsmi_wrapper.amdsmi_set_xgmi_plpd( + processor_handle, policy_id + ) + ) + def amdsmi_set_gpu_overdrive_level( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int ): @@ -3335,6 +3349,37 @@ def amdsmi_get_dpm_policy( "policies": polices, } +def amdsmi_get_xgmi_plpd( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + policy = amdsmi_wrapper.amdsmi_dpm_policy_t() + _check_res( + amdsmi_wrapper.amdsmi_get_xgmi_plpd( + processor_handle, ctypes.byref(policy) + ) + ) + + polices = [] + for i in range(0, policy.num_supported): + id = policy.policies[i].policy_id + desc = policy.policies[i].policy_description + 
polices.append({ + 'policy_id' : id, + 'policy_description': desc.decode() + }) + current_id = policy.policies[policy.current].policy_id + + return { + "num_supported": policy.num_supported, + "current_id": current_id, + "plpds": polices, + } + def amdsmi_get_gpu_od_volt_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index f718dcfa87..13cd2062ac 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -746,19 +746,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - class struct_pcie_metric_(Structure): pass @@ -777,6 +764,19 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 13), ] +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -2058,6 +2058,12 @@ amdsmi_get_dpm_policy.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct amdsmi_set_dpm_policy = _libraries['libamd_smi.so'].amdsmi_set_dpm_policy amdsmi_set_dpm_policy.restype = amdsmi_status_t 
amdsmi_set_dpm_policy.argtypes = [amdsmi_processor_handle, uint32_t] +amdsmi_get_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_get_xgmi_plpd +amdsmi_get_xgmi_plpd.restype = amdsmi_status_t +amdsmi_get_xgmi_plpd.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_dpm_policy_t)] +amdsmi_set_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_set_xgmi_plpd +amdsmi_set_xgmi_plpd.restype = amdsmi_status_t +amdsmi_set_xgmi_plpd.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version amdsmi_get_lib_version.restype = amdsmi_status_t amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)] @@ -2594,8 +2600,9 @@ __all__ = \ 'amdsmi_get_processor_info', 'amdsmi_get_processor_type', 'amdsmi_get_socket_handles', 'amdsmi_get_socket_info', 'amdsmi_get_temp_metric', 'amdsmi_get_utilization_count', - 'amdsmi_get_xgmi_info', 'amdsmi_gpu_block_t', - 'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter', + 'amdsmi_get_xgmi_info', 'amdsmi_get_xgmi_plpd', + 'amdsmi_gpu_block_t', 'amdsmi_gpu_cache_info_t', + 'amdsmi_gpu_control_counter', 'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter', 'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t', 'amdsmi_gpu_read_counter', 'amdsmi_gpu_xgmi_error_status', @@ -2636,10 +2643,10 @@ __all__ = \ 'amdsmi_set_gpu_overdrive_level', 'amdsmi_set_gpu_pci_bandwidth', 'amdsmi_set_gpu_perf_determinism_mode', 'amdsmi_set_gpu_perf_level', 'amdsmi_set_gpu_power_profile', - 'amdsmi_set_power_cap', 'amdsmi_shut_down', - 'amdsmi_smu_fw_version_t', 'amdsmi_socket_handle', - 'amdsmi_status_code_to_string', 'amdsmi_status_t', - 'amdsmi_stop_gpu_event_notification', + 'amdsmi_set_power_cap', 'amdsmi_set_xgmi_plpd', + 'amdsmi_shut_down', 'amdsmi_smu_fw_version_t', + 'amdsmi_socket_handle', 'amdsmi_status_code_to_string', + 'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification', 'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t', 
'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type', 'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number', diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 1265421355..e10ab49b34 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3364,6 +3364,45 @@ rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind, uint32_t policy_id); +/** + * @brief Get the xgmi per-link power down policy parameter for a device + * + * + * @details Given a device index @p dv_ind, this function will write + * current xgmi plpd settings to @p xgmi_plpd. All the processors at the same socket + * will have the same policy. + * + * @param[in] dv_ind a device index + * + * @param[in, out] xgmi_plpd the xgmi_plpd policy for this device. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_xgmi_plpd_get(uint32_t dv_ind, + rsmi_dpm_policy_t* xgmi_plpd); + +/** + * @brief Set the xgmi per-link power down policy parameter for a device + * + * + * @details Given a device index @p dv_ind, and an xgmi plpd id @p plpd_id, + * this function will set the xgmi plpd for this processor. All the processors at + * the same socket will be set to the same policy. + * + * @note This function requires root access + * + * @param[in] dv_ind a device index + * + * @param[in] plpd_id the xgmi plpd id to set.
The id is the id in + * rsmi_dpm_policy_entry_t, which can be obtained by calling + * rsmi_dev_xgmi_plpd_get() + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_xgmi_plpd_set(uint32_t dv_ind, + uint32_t plpd_id); /** @} */ // end of PerfCont /*****************************************************************************/ diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index 91c8ddbb69..6aa0d86fce 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -2038,6 +2038,130 @@ rsmi_dev_dpm_policy_set(uint32_t dv_ind, CATCH } +rsmi_status_t +rsmi_dev_xgmi_plpd_get(uint32_t dv_ind, + rsmi_dpm_policy_t* policy) { + rsmi_status_t ret; + std::vector val_vec; + + if (policy == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + *policy = {}; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + DEVICE_MUTEX + + ret = GetDevValueVec(amd::smi::kDevDPMPolicy, dv_ind, &val_vec); + if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); + return ret; + } + /* + It will rely on the number but no string as it may vary from soc to soc.
+ The current xmgi plpd marked with * + xgmi plpd + 0 : plpd_disallow + 1 : plpd_default + 2 : plpd_optimized* + */ + bool see_plpd_pstate = false; + bool see_current = false; + policy->num_supported = 0; + for (uint32_t i = 0; i < val_vec.size(); ++i) { + auto current_line = amd::smi::trim(val_vec[i]); + if (current_line == "xgmi plpd") { + see_plpd_pstate = true; + continue; + } + if (see_plpd_pstate == false) continue; + + // Get tokens: : + std::vector tokens; + std::istringstream f(current_line); + std::string s; + while (getline(f, s, ':')) { + tokens.push_back(s); + } + + int value = 0; + // At the end + if (tokens.size() < 2 || !amd::smi::stringToInteger(tokens[0], value)) { + break; + } + + if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpected pstat data: the id is negative or too many plpd policies."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + + policy->policies[policy->num_supported].policy_id = value; + std::string description = amd::smi::trim(tokens[1]); + if (current_line.back() == '*') { // current policy + description.pop_back(); // remove last * + description = amd::smi::trim(description); + policy->current = policy->num_supported; + see_current = true; + } + strncpy(policy->policies[policy->num_supported].policy_description, + description.c_str(), + RSMI_MAX_POLICY_NAME-1); + policy->num_supported++; + } // end for + + if (!see_plpd_pstate) { + return RSMI_STATUS_NOT_SUPPORTED; + } + + if (!see_current) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpected pstat data: cannot find the current plpd policy."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + // Cannot find it + return RSMI_STATUS_SUCCESS; + + CATCH +} + +rsmi_status_t +rsmi_dev_xgmi_plpd_set(uint32_t dv_ind, + uint32_t plpd_id) { + rsmi_status_t ret; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + 
LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + std::string value("xgmi "); + value += std::to_string(plpd_id); + int ret = dev->writeDevInfo(amd::smi::kDevDPMPolicy , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, rsmi_dpm_policy_t* policy) { @@ -2107,7 +2231,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind, if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", Unexpeced pstat data: the id is negative or too many policies."; + << ", Unexpected pstat data: the id is negative or too many policies."; LOG_ERROR(ss); return RSMI_STATUS_UNEXPECTED_DATA; } @@ -2132,7 +2256,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind, if (!see_current) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", Unexpeced pstat data: cannot find the current policy."; + << ", Unexpected pstat data: cannot find the current policy."; LOG_ERROR(ss); return RSMI_STATUS_UNEXPECTED_DATA; } diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index 3e63659c82..92de58c6a1 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -536,8 +536,10 @@ static const std::map kDevFuncDependsMap = { {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, {"rsmi_dev_pm_metrics_info_get", {{kDevPmMetricsFName}, {}}}, - {"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}}, - {"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_xgmi_plpd_get", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_xgmi_plpd_set", {{kDevDPMPolicyFName}, {}}}, {"rsmi_dev_reg_table_info_get", {{kDevRegMetricsFName}, {}}}, {"rsmi_dev_gpu_reset", 
{{kDevGpuResetFName}, {}}}, {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}}, diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index e57ae30cbb..1dafee87ff 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1369,6 +1369,22 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, reinterpret_cast(policy)); } +amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle, + uint32_t policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_xgmi_plpd_set, processor_handle, + policy); +} + +amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_xgmi_plpd_get, processor_handle, + reinterpret_cast(policy)); +} + amdsmi_status_t amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages,