Get and set the XGMI PLPD

Update the API and CLI to support XGMI Per-Link Power Down Policy. Change-Id: Iaf04a771eb8bb0829a5b3088d803a7355a8dfd0b [ROCm/amdsmi commit: e4085c6414]
2024-03-20 12:06:24 -05:00
@@ -280,7 +280,7 @@ usage: amd-smi metric [-h] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE
                      [--core-curr-active-freq-core-limit] [--core-energy]
                      [--json | --csv] [--file FILE] [--loglevel LEVEL]

-If no GPU is specified, returns metric information for all GPUs on the system.                                
+If no GPU is specified, returns metric information for all GPUs on the system.
 If no metric argument is provided all metric information will be displayed.

 Metric arguments:
@@ -325,16 +325,16 @@ CPU Arguments:
  --cpu-c0-res                              Displays C0 residency
  --cpu-lclk-dpm-level NBIOID               Displays lclk dpm level range. Requires socket ID and NBOID as inputs
  --cpu-pwr-svi-telemtry-rails              Displays svi based telemetry for all rails
-  --cpu-io-bandwidth IO_BW LINKID_NAME      Displays current IO bandwidth for the selected CPU.        
-                                             input parameters are bandwidth type(1) and link ID encodings        
+  --cpu-io-bandwidth IO_BW LINKID_NAME      Displays current IO bandwidth for the selected CPU.
+                                             input parameters are bandwidth type(1) and link ID encodings
                                             i.e. P2, P3, G0 - G7
-  --cpu-xgmi-bandwidth XGMI_BW LINKID_NAME  Displays current XGMI bandwidth for the selected CPU        
-                                             input parameters are bandwidth type(1,2,4) and link ID encodings        
+  --cpu-xgmi-bandwidth XGMI_BW LINKID_NAME  Displays current XGMI bandwidth for the selected CPU
+                                             input parameters are bandwidth type(1,2,4) and link ID encodings
                                             i.e. P2, P3, G0 - G7
  --cpu-metrics-ver                         Displays metrics table version
  --cpu-metrics-table                       Displays metric table
  --cpu-socket-energy                       Displays socket energy for the selected CPU socket
-  --cpu-ddr-bandwidth                       Displays per socket max ddr bw, current utilized bw,        
+  --cpu-ddr-bandwidth                       Displays per socket max ddr bw, current utilized bw,
                                             and current utilized ddr bw in percentage
  --cpu-temp                                Displays cpu socket temperature
  --cpu-dimm-temp-range-rate DIMM_ADDR      Displays dimm temperature range and refresh rate
@@ -437,7 +437,7 @@ usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
                        [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [-a]
                        [-w] [-o] [-t] [-b]

-If no GPU is specified, returns information for all GPUs on the system.                                
+If no GPU is specified, returns information for all GPUs on the system.
 If no topology argument is provided all topology information will be displayed.

 Topology arguments:
@@ -483,7 +483,7 @@ usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...
                   [--core-boost-limit BOOST_LIMIT] [--json | --csv] [--file FILE]
                   [--loglevel LEVEL]

-A GPU must be specified to set a configuration.                                    
+A GPU must be specified to set a configuration.
 A set argument must be provided; Multiple set arguments are accepted

 Set Arguments:
@@ -513,11 +513,12 @@ Set Arguments:
                                                NPS1, NPS2, NPS4, NPS8
  -o, --power-cap WATTS                        Set power capacity limit
  -p, --dpm-policy POLICY_ID                   Set the GPU DPM policy using policy id
+  -x, --xgmi-plpd POLICY_ID                    Set the GPU XGMI per-link power down policy using policy id

 CPU Arguments:
  --cpu-pwr-limit PWR_LIMIT                    Set power limit for the given socket. Input parameter is power limit value.
  --cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH    Set max and Min linkwidth. Input parameters are min and max link width values
-  --cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM  Sets the max and min dpm level on a given NBIO.        
+  --cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM  Sets the max and min dpm level on a given NBIO.
                                                Input parameters are die_index, min dpm, max dpm.
  --cpu-pwr-eff-mode MODE                      Sets the power efficency mode policy. Input parameter is mode.
  --cpu-gmi3-link-width MIN_LW MAX_LW          Sets max and min gmi3 link width range
@@ -675,7 +676,7 @@ GPU: 0
    PARTITION:
        COMPUTE_PARTITION: SPX
        MEMORY_PARTITION: NPS1
-    POLICY:
+    DPM_POLICY:
        NUM_SUPPORTED: 4
        CURRENT_ID: 1
        POLICIES:
@@ -687,6 +688,16 @@ GPU: 0
            POLICY_DESCRIPTION: soc_pstate_1
            POLICY_ID: 3
            POLICY_DESCRIPTION: soc_pstate_2
+    XGMI_PLPD:
+        NUM_SUPPORTED: 3
+        CURRENT_ID: 1
+        PLPDS:
+            POLICY_ID: 0
+            POLICY_DESCRIPTION: plpd_disallow
+            POLICY_ID: 1
+            POLICY_DESCRIPTION: plpd_default
+            POLICY_ID: 2
+            POLICY_DESCRIPTION: plpd_optimized
    NUMA:
        NODE: 0
        AFFINITY: 0
@@ -783,7 +794,7 @@ GPU: 1
    PARTITION:
        COMPUTE_PARTITION: SPX
        MEMORY_PARTITION: NPS1
-    POLICY:
+    DPM_POLICY:
        NUM_SUPPORTED: 4
        CURRENT_ID: 1
        POLICIES:
@@ -795,6 +806,16 @@ GPU: 1
            POLICY_DESCRIPTION: soc_pstate_1
            POLICY_ID: 3
            POLICY_DESCRIPTION: soc_pstate_2
+    XGMI_PLPD:
+        NUM_SUPPORTED: 3
+        CURRENT_ID: 1
+        PLPDS:
+            POLICY_ID: 0
+            POLICY_DESCRIPTION: plpd_disallow
+            POLICY_ID: 1
+            POLICY_DESCRIPTION: plpd_default
+            POLICY_ID: 2
+            POLICY_DESCRIPTION: plpd_optimized
    NUMA:
        NODE: 1
        AFFINITY: 1
@@ -891,7 +912,7 @@ GPU: 2
    PARTITION:
        COMPUTE_PARTITION: SPX
        MEMORY_PARTITION: NPS1
-    POLICY:
+    DPM_POLICY:
        NUM_SUPPORTED: 4
        CURRENT_ID: 1
        POLICIES:
@@ -903,6 +924,16 @@ GPU: 2
            POLICY_DESCRIPTION: soc_pstate_1
            POLICY_ID: 3
            POLICY_DESCRIPTION: soc_pstate_2
+    XGMI_PLPD:
+        NUM_SUPPORTED: 3
+        CURRENT_ID: 1
+        PLPDS:
+            POLICY_ID: 0
+            POLICY_DESCRIPTION: plpd_disallow
+            POLICY_ID: 1
+            POLICY_DESCRIPTION: plpd_default
+            POLICY_ID: 2
+            POLICY_DESCRIPTION: plpd_optimized
    NUMA:
        NODE: 2
        AFFINITY: 2
@@ -999,7 +1030,7 @@ GPU: 3
    PARTITION:
        COMPUTE_PARTITION: SPX
        MEMORY_PARTITION: NPS1
-    POLICY:
+    DPM_POLICY:
        NUM_SUPPORTED: 4
        CURRENT_ID: 1
        POLICIES:
@@ -1011,6 +1042,16 @@ GPU: 3
            POLICY_DESCRIPTION: soc_pstate_1
            POLICY_ID: 3
            POLICY_DESCRIPTION: soc_pstate_2
+    XGMI_PLPD:
+        NUM_SUPPORTED: 3
+        CURRENT_ID: 1
+        PLPDS:
+            POLICY_ID: 0
+            POLICY_DESCRIPTION: plpd_disallow
+            POLICY_ID: 1
+            POLICY_DESCRIPTION: plpd_default
+            POLICY_ID: 2
+            POLICY_DESCRIPTION: plpd_optimized
    NUMA:
        NODE: 3
        AFFINITY: 3
@@ -244,7 +244,8 @@ class AMDSMICommands():

    def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None,
                        limit=None, driver=None, ras=None, board=None, numa=None, vram=None,
-                        cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, policy=None):
+                        cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None,
+                        policy=None, xgmi_plpd=None):
        """Get Static information for target gpu

        Args:
@@ -268,6 +269,7 @@ class AMDSMICommands():
            fb_info (bool, optional): Value override for args.fb_info. Defaults to None.
            num_vf (bool, optional): Value override for args.num_vf. Defaults to None.
            policy (bool, optional): Value override for args.policy. Defaults to None.
+            xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None.
        Returns:
            None: Print output via AMDSMILogger to destination
        """
@@ -302,8 +304,10 @@ class AMDSMICommands():
                args.limit = limit
            if policy:
                args.policy = policy
-            current_platform_args += ["ras", "limit", "partition", "policy"]
-            current_platform_values += [args.ras, args.limit, args.partition, args.policy]
+            if xgmi_plpd:
+                args.xgmi_plpd = xgmi_plpd
+            current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd"]
+            current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd]

        if self.helpers.is_linux() and not self.helpers.is_virtual_os():
            if numa:
@@ -630,6 +634,15 @@ class AMDSMICommands():
                    logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info())

                static_dict['dpm_policy'] = policy_info
+        if 'xgmi_plpd' in current_platform_args:
+            if args.xgmi_plpd:
+                try:
+                    policy_info = amdsmi_interface.amdsmi_get_xgmi_plpd(args.gpu)
+                except amdsmi_exception.AmdSmiLibraryException as e:
+                    policy_info = "N/A"
+                    logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info())
+
+                static_dict['xgmi_plpd'] = policy_info
        if 'numa' in current_platform_args:
            if args.numa:
                try:
@@ -766,7 +779,7 @@ class AMDSMICommands():
                bus=None, vbios=None, limit=None, driver=None, ras=None,
                board=None, numa=None, vram=None, cache=None, partition=None,
                dfc_ucode=None, fb_info=None, num_vf=None, cpu=None,
-                interface_ver=None, policy=None):
+                interface_ver=None, policy=None, xgmi_plpd = None):
        """Get Static information for target gpu and cpu

        Args:
@@ -790,6 +803,7 @@ class AMDSMICommands():
            cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None.
            interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None
            policy (bool, optional): Value override for args.policy. Defaults to None.
+            xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None.
        Raises:
            IndexError: Index error if gpu list is empty

@@ -815,7 +829,7 @@ class AMDSMICommands():
        gpu_args_enabled = False
        gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras",
                          "board", "numa", "vram", "cache", "partition",
-                          "dfc_ucode", "fb_info", "num_vf", "policy"]
+                          "dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd"]
        for attr in gpu_attributes:
            if hasattr(args, attr):
                if getattr(args, attr):
@@ -859,7 +873,7 @@ class AMDSMICommands():
            self.static_gpu(args, multiple_devices, gpu, asic,
                                bus, vbios, limit, driver, ras,
                                board, numa, vram, cache, partition,
-                                dfc_ucode, fb_info, num_vf, policy)
+                                dfc_ucode, fb_info, num_vf, policy, xgmi_plpd)


    def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True):
@@ -3090,7 +3104,7 @@ class AMDSMICommands():

    def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
                  profile=None, perf_determinism=None, compute_partition=None,
-                  memory_partition=None, power_cap=None, dpm_policy=None):
+                  memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None):
        """Issue reset commands to target gpu(s)

        Args:
@@ -3105,6 +3119,7 @@ class AMDSMICommands():
            memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None.
            power_cap (int, optional): Value override for args.power_cap. Defaults to None.
            dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None.
+            xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None.

        Raises:
            ValueError: Value error if no gpu value is provided
@@ -3132,6 +3147,8 @@ class AMDSMICommands():
            args.power_cap = power_cap
        if dpm_policy:
            args.dpm_policy = dpm_policy
+        if xgmi_plpd:
+            args.xgmi_plpd = xgmi_plpd
        # Handle No GPU passed
        if args.gpu == None:
            raise ValueError('No GPU provided, specific GPU target(s) are needed')
@@ -3151,7 +3168,8 @@ class AMDSMICommands():
                    args.memory_partition,
                    args.perf_determinism is not None,
                    args.power_cap,
-                    args.dpm_policy]):
+                    args.dpm_policy,
+                    args.xgmi_plpd is not None]):  # policy id 0 must count as "provided"
            command = " ".join(sys.argv[1:])
            raise AmdSmiRequiredCommandException(command, self.logger.format)

@@ -3225,6 +3243,15 @@ class AMDSMICommands():
                raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e
            self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}")

+        if isinstance(args.xgmi_plpd, int):  # id 0 is a valid policy, so test the type rather than truthiness
+            try:
+                amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd)
+            except amdsmi_exception.AmdSmiLibraryException as e:
+                if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
+                    raise PermissionError('Command requires elevation') from e
+                raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e
+            self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.xgmi_plpd}")
+
        if isinstance(args.power_cap, int):
            try:
                power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
@@ -3264,7 +3291,7 @@ class AMDSMICommands():
                  cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None,
                  cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None,
                  cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None,
-                  soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None):
+                  soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None):
        """Issue reset commands to target gpu(s)

        Args:
@@ -3294,6 +3321,7 @@ class AMDSMICommands():
            core (device_handle, optional): device_handle for target core. Defaults to None.
            core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None
            dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None.
+            xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None.

        Raises:
            ValueError: Value error if no gpu value is provided
@@ -3314,7 +3342,7 @@ class AMDSMICommands():
        # Check if a GPU argument has been set
        gpu_args_enabled = False
        gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
-                          "memory_partition", "power_cap", "dpm_policy"]
+                          "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd"]
        for attr in gpu_attributes:
            if hasattr(args, attr):
                if getattr(args, attr) is not None:
@@ -3370,7 +3398,7 @@ class AMDSMICommands():
                self.logger.clear_multiple_devices_ouput()
                self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
                                profile, perf_determinism, compute_partition,
-                                memory_partition, power_cap, dpm_policy)
+                                memory_partition, power_cap, dpm_policy, xgmi_plpd)
        elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
            if args.cpu == None and args.core == None:
                raise ValueError('No CPU or CORE provided, specific target(s) are needed')
@@ -3389,7 +3417,7 @@ class AMDSMICommands():
            self.logger.clear_multiple_devices_ouput()
            self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
                            profile, perf_determinism, compute_partition,
-                            memory_partition, power_cap, dpm_policy)
+                            memory_partition, power_cap, dpm_policy, xgmi_plpd)


    def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
@@ -544,6 +544,7 @@ class AMDSMIParser(argparse.ArgumentParser):
        cache_help = "All cache information"
        board_help = "All board information"
        dpm_policy_help = "The available DPM policy"
+        xgmi_plpd_help = "The available XGMI per-link power down policy"

        # Options arguments help text for Hypervisors and Baremetal
        ras_help = "Displays RAS features information"
@@ -584,6 +585,7 @@ class AMDSMIParser(argparse.ArgumentParser):
                static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help)
                static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help)
                static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help)
+                static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help)

            if self.helpers.is_linux() and not self.helpers.is_virtual_os():
                static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help)
@@ -966,6 +968,7 @@ class AMDSMIParser(argparse.ArgumentParser):
        set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}"
        set_power_cap_help = "Set power capacity limit"
        set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n"
+        set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id\n"

        # Help text for CPU set options
        set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value."
@@ -1002,6 +1005,7 @@ class AMDSMIParser(argparse.ArgumentParser):
            set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
            set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS')
            set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False,  type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID')
+            set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False,  type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID')

        if self.helpers.is_amd_hsmp_initialized():
            # Optional CPU Args
@@ -3405,6 +3405,49 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle,
 */
 amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle,
                             uint32_t policy_id);
+
+/**
+ * @brief Get the xgmi per-link power down policy parameter for the processor
+ *
+ * @platform{gpu_bm_linux}
+ *
+ * @details Given a processor handle @p processor_handle, this function will write
+ * current xgmi plpd settings to @p xgmi_plpd. All the processors at the same socket
+ * will have the same policy.
+ *
+ *  @param[in] processor_handle a processor handle
+ *
+ *  @param[in, out] xgmi_plpd the xgmi plpd for this processor.
+ *  If this parameter is nullptr, this function will return
+ *  ::AMDSMI_STATUS_INVAL
+ *
+ *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
+ */
+amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle,
+                             amdsmi_dpm_policy_t* xgmi_plpd);
+
+/**
+ * @brief Set the xgmi per-link power down policy parameter for the processor
+ *
+ * @platform{gpu_bm_linux}
+ *
+ * @details Given a processor handle @p processor_handle and an xgmi plpd id @p plpd_id,
+ * this function will set the xgmi plpd for this processor. All the processors at
+ * the same socket will be set to the same policy.
+ *
+ *  @note This function requires root access
+ *
+ *  @param[in] processor_handle a processor handle
+ *
+ *  @param[in] plpd_id the xgmi plpd id to set. The id is the id in
+ *  amdsmi_dpm_policy_entry_t, which can be obtained by calling
+ *  amdsmi_get_xgmi_plpd()
+ *
+ *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
+ */
+amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle,
+                             uint32_t plpd_id);
+
 /** @} End PerfCont */

 /*****************************************************************************/
@@ -909,8 +909,8 @@ Field | Description
 `name` | Name of process
 `pid` | Process ID
 `mem` | Process memory usage
-`engine_usage`| <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
-`memory_usage`| <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
+`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
+`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>

 Exceptions that can be thrown by `amdsmi_get_gpu_process_info` function:

@@ -2612,6 +2612,74 @@ except AmdSmiException as e:
    print(e)
 ```

+### amdsmi_set_xgmi_plpd
+
+Description: Set the xgmi per-link power down policy parameter for the processor
+
+Input parameters:
+
+* `processor_handle` handle for the given device
+* `policy_id` the xgmi plpd id to set.
+
+Output: None
+
+Exceptions that can be thrown by `amdsmi_set_xgmi_plpd` function:
+
+* `AmdSmiLibraryException`
+* `AmdSmiRetryException`
+* `AmdSmiParameterException`
+
+Example:
+
+```python
+try:
+    devices = amdsmi_get_processor_handles()
+    if len(devices) == 0:
+        print("No GPUs on machine")
+    else:
+        for device in devices:
+            amdsmi_set_xgmi_plpd(device, 0)
+except AmdSmiException as e:
+    print(e)
+```
+
+### amdsmi_get_xgmi_plpd
+
+Description: Get the xgmi per-link power down policy parameter for the processor
+
+Input parameters:
+
+* `processor_handle` handle for the given device
+
+Output: Dict containing information about xgmi per-link power down policy
+
+Field | Description
+---|---
+`num_supported` | The number of supported policies
+`current_id` | The `policy_id` of the currently active policy
+`plpds` | List of supported policies, each a dict with `policy_id` and `policy_description`
+
+Exceptions that can be thrown by `amdsmi_get_xgmi_plpd` function:
+
+* `AmdSmiLibraryException`
+* `AmdSmiRetryException`
+* `AmdSmiParameterException`
+
+Example:
+
+```python
+try:
+    devices = amdsmi_get_processor_handles()
+    if len(devices) == 0:
+        print("No GPUs on machine")
+    else:
+        for device in devices:
+            xgmi_plpd =  amdsmi_get_xgmi_plpd(device)
+            print(xgmi_plpd)
+except AmdSmiException as e:
+    print(e)
+```
+
 ### amdsmi_set_gpu_overdrive_level

 Description: **deprecated** Set the overdrive percent associated with the
@@ -2746,6 +2746,20 @@ def amdsmi_set_dpm_policy(
        )
    )

+def amdsmi_set_xgmi_plpd(
+    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
+    policy_id: int,
+):
+    """Apply the XGMI per-link power down policy `policy_id` to a processor."""
+    handle_type = amdsmi_wrapper.amdsmi_processor_handle
+    if not isinstance(processor_handle, handle_type):
+        raise AmdSmiParameterException(processor_handle, handle_type)
+
+    # The library status is handed to _check_res (presumably raising on a
+    # non-success status -- confirm against its definition).
+    status = amdsmi_wrapper.amdsmi_set_xgmi_plpd(processor_handle, policy_id)
+    _check_res(status)
+
 def amdsmi_set_gpu_overdrive_level(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int
 ):
@@ -3335,6 +3349,37 @@ def amdsmi_get_dpm_policy(
        "policies": polices,
    }

+def amdsmi_get_xgmi_plpd(
+    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
+) -> Dict[str, Any]:
+    """Query the XGMI per-link power down (PLPD) policies of a processor.
+
+    Returns a dict with "num_supported", "current_id" (the policy_id of the
+    entry indexed by the library's `current` field) and "plpds", a list of
+    dicts holding 'policy_id' and 'policy_description'.
+    """
+    handle_type = amdsmi_wrapper.amdsmi_processor_handle
+    if not isinstance(processor_handle, handle_type):
+        raise AmdSmiParameterException(processor_handle, handle_type)
+
+    plpd_info = amdsmi_wrapper.amdsmi_dpm_policy_t()
+    status = amdsmi_wrapper.amdsmi_get_xgmi_plpd(processor_handle, ctypes.byref(plpd_info))
+    _check_res(status)
+
+    # Report only the first num_supported entries of the policy table.
+    supported = []
+    for idx in range(plpd_info.num_supported):
+        entry = plpd_info.policies[idx]
+        supported.append({
+            'policy_id': entry.policy_id,
+            'policy_description': entry.policy_description.decode(),
+        })
+    return {
+        "num_supported": plpd_info.num_supported,
+        "current_id": plpd_info.policies[plpd_info.current].policy_id,
+        "plpds": supported,
+    }
+
 def amdsmi_get_gpu_od_volt_info(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
 ) -> Dict[str, Any]:
@@ -746,19 +746,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum
 class struct_amdsmi_pcie_info_t(Structure):
    pass

-class struct_pcie_static_(Structure):
-    pass
-
-struct_pcie_static_._pack_ = 1 # source:False
-struct_pcie_static_._fields_ = [
-    ('max_pcie_width', ctypes.c_uint16),
-    ('PADDING_0', ctypes.c_ubyte * 2),
-    ('max_pcie_speed', ctypes.c_uint32),
-    ('pcie_interface_version', ctypes.c_uint32),
-    ('slot_type', amdsmi_card_form_factor_t),
-    ('reserved', ctypes.c_uint64 * 10),
-]
-
 class struct_pcie_metric_(Structure):
    pass

@@ -777,6 +764,19 @@ struct_pcie_metric_._fields_ = [
    ('reserved', ctypes.c_uint64 * 13),
 ]

+class struct_pcie_static_(Structure):
+    pass
+
+struct_pcie_static_._pack_ = 1 # source:False
+struct_pcie_static_._fields_ = [
+    ('max_pcie_width', ctypes.c_uint16),
+    ('PADDING_0', ctypes.c_ubyte * 2),
+    ('max_pcie_speed', ctypes.c_uint32),
+    ('pcie_interface_version', ctypes.c_uint32),
+    ('slot_type', amdsmi_card_form_factor_t),
+    ('reserved', ctypes.c_uint64 * 10),
+]
+
 struct_amdsmi_pcie_info_t._pack_ = 1 # source:False
 struct_amdsmi_pcie_info_t._fields_ = [
    ('pcie_static', struct_pcie_static_),
@@ -2058,6 +2058,12 @@ amdsmi_get_dpm_policy.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct
 amdsmi_set_dpm_policy = _libraries['libamd_smi.so'].amdsmi_set_dpm_policy
 amdsmi_set_dpm_policy.restype = amdsmi_status_t
 amdsmi_set_dpm_policy.argtypes = [amdsmi_processor_handle, uint32_t]
+amdsmi_get_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_get_xgmi_plpd
+amdsmi_get_xgmi_plpd.restype = amdsmi_status_t
+amdsmi_get_xgmi_plpd.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_dpm_policy_t)]
+amdsmi_set_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_set_xgmi_plpd
+amdsmi_set_xgmi_plpd.restype = amdsmi_status_t
+amdsmi_set_xgmi_plpd.argtypes = [amdsmi_processor_handle, uint32_t]
 amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version
 amdsmi_get_lib_version.restype = amdsmi_status_t
 amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)]
@@ -2594,8 +2600,9 @@ __all__ = \
    'amdsmi_get_processor_info', 'amdsmi_get_processor_type',
    'amdsmi_get_socket_handles', 'amdsmi_get_socket_info',
    'amdsmi_get_temp_metric', 'amdsmi_get_utilization_count',
-    'amdsmi_get_xgmi_info', 'amdsmi_gpu_block_t',
-    'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter',
+    'amdsmi_get_xgmi_info', 'amdsmi_get_xgmi_plpd',
+    'amdsmi_gpu_block_t', 'amdsmi_gpu_cache_info_t',
+    'amdsmi_gpu_control_counter',
    'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter',
    'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t',
    'amdsmi_gpu_read_counter', 'amdsmi_gpu_xgmi_error_status',
@@ -2636,10 +2643,10 @@ __all__ = \
    'amdsmi_set_gpu_overdrive_level', 'amdsmi_set_gpu_pci_bandwidth',
    'amdsmi_set_gpu_perf_determinism_mode',
    'amdsmi_set_gpu_perf_level', 'amdsmi_set_gpu_power_profile',
-    'amdsmi_set_power_cap', 'amdsmi_shut_down',
-    'amdsmi_smu_fw_version_t', 'amdsmi_socket_handle',
-    'amdsmi_status_code_to_string', 'amdsmi_status_t',
-    'amdsmi_stop_gpu_event_notification',
+    'amdsmi_set_power_cap', 'amdsmi_set_xgmi_plpd',
+    'amdsmi_shut_down', 'amdsmi_smu_fw_version_t',
+    'amdsmi_socket_handle', 'amdsmi_status_code_to_string',
+    'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification',
    'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t',
    'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type',
    'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number',
@@ -3364,6 +3364,45 @@ rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind,
 rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind,
                             uint32_t policy_id);

+/**
+ * @brief Get the xgmi per-link power down policy parameter for a device
+ *
+ *
+ * @details Given a device index @p dv_ind, this function will write
+ * current xgmi plpd settings to @p xgmi_plpd. All the processors at the same socket
+ * will have the same policy.
+ *
+ *  @param[in] dv_ind a device index
+ *
+ *  @param[in, out] xgmi_plpd the xgmi_plpd policy for this device.
+ *  If this parameter is nullptr, this function will return
+ *  ::RSMI_STATUS_INVALID_ARGS
+ *
+ *  @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail
+ */
+rsmi_status_t rsmi_dev_xgmi_plpd_get(uint32_t dv_ind,
+                             rsmi_dpm_policy_t* xgmi_plpd);
+
+/**
+ * @brief Set the xgmi per-link power down policy parameter for a device
+ *
+ *
+ * @details  Given a device index @p dv_ind and an xgmi plpd id @p plpd_id,
+ * this function will set the xgmi plpd for this device. All the processors at
+ * the same socket will be set to the same policy.
+ *
+ *  @note This function requires root access
+ *
+ *  @param[in] dv_ind a device index
+ *
+ *  @param[in] plpd_id the xgmi plpd id to set. The id is the id in
+ *  rsmi_dpm_policy_entry_t, which can be obtained by calling
+ *  rsmi_dev_xgmi_plpd_get()
+ *
+ *  @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail
+ */
+rsmi_status_t rsmi_dev_xgmi_plpd_set(uint32_t dv_ind,
+                             uint32_t plpd_id);
 /** @} */  // end of PerfCont

 /*****************************************************************************/
@@ -2038,6 +2038,130 @@ rsmi_dev_dpm_policy_set(uint32_t dv_ind,
  CATCH
 }

+// Fills *policy with the "xgmi plpd" table read from the kDevDPMPolicy device
+// file: one entry per "<id> : <name>" row; the row ending in '*' is current.
+rsmi_status_t
+rsmi_dev_xgmi_plpd_get(uint32_t dv_ind,
+                      rsmi_dpm_policy_t* policy) {
+  rsmi_status_t ret;
+  std::vector<std::string> val_vec;
+  if (policy == nullptr) {
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  *policy = {};
+
+  TRY
+  std::ostringstream ss;
+  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
+  LOG_TRACE(ss);
+  DEVICE_MUTEX
+
+  ret = GetDevValueVec(amd::smi::kDevDPMPolicy, dv_ind, &val_vec);
+  if (ret == RSMI_STATUS_FILE_ERROR) {
+    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
+       << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR "
+       << "-> reporting RSMI_STATUS_NOT_SUPPORTED";
+    LOG_ERROR(ss);
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+  if (ret != RSMI_STATUS_SUCCESS) {
+    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
+       << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS"
+       << " -> reporting " << amd::smi::getRSMIStatusString(ret);
+    LOG_ERROR(ss);
+    return ret;
+  }
+  /*
+    It relies on the number, not the string, as the string may vary from soc to soc.
+    The current xgmi plpd is marked with *
+    xgmi plpd
+    0 : plpd_disallow
+    1 : plpd_default
+    2 : plpd_optimized*
+  */
+  bool see_plpd_pstate = false;
+  bool see_current = false;
+  policy->num_supported = 0;
+  for (uint32_t i = 0; i < val_vec.size(); ++i) {
+    auto current_line = amd::smi::trim(val_vec[i]);
+    if (current_line == "xgmi plpd") {
+      see_plpd_pstate = true;
+      continue;
+    }
+    if (see_plpd_pstate == false) continue;
+
+    // Get tokens: <integer> : <string *>
+    std::vector<std::string> tokens;
+    std::istringstream f(current_line);
+    std::string s;
+    while (getline(f, s, ':')) tokens.push_back(s);
+
+    int value = 0;
+    // A line that is not "<integer> : <string>" ends the xgmi plpd table.
+    if (tokens.size() < 2 || !amd::smi::stringToInteger(tokens[0], value)) {
+      break;
+    }
+
+    if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) {
+      ss << __PRETTY_FUNCTION__ << " | ======= end ======="
+         << ", Unexpected pstat data: the id is negative or too many plpd policies.";
+      LOG_ERROR(ss);
+      return RSMI_STATUS_UNEXPECTED_DATA;
+    }
+
+    policy->policies[policy->num_supported].policy_id = value;
+    std::string description = amd::smi::trim(tokens[1]);
+    if (current_line.back() == '*') {  // current policy
+      // Strip the marker, but only if it really ends the description text.
+      if (!description.empty() && description.back() == '*') description.pop_back();
+      description = amd::smi::trim(description);
+      policy->current = policy->num_supported;
+      see_current = true;
+    }
+    strncpy(policy->policies[policy->num_supported].policy_description,
+          description.c_str(),
+          RSMI_MAX_POLICY_NAME-1);
+    policy->num_supported++;
+  }  //  end for
+
+  if (!see_plpd_pstate) {
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+
+  if (!see_current) {
+    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
+       << ", Unexpected pstat data: cannot find the current plpd policy.";
+    LOG_ERROR(ss);
+    return RSMI_STATUS_UNEXPECTED_DATA;
+  }
+  // The table was parsed and its current policy was found.
+  return RSMI_STATUS_SUCCESS;
+
+  CATCH
+}
+
+// Selects xgmi per-link power down (plpd) policy `plpd_id` for device `dv_ind`
+// by writing the string "xgmi <plpd_id>" through the kDevDPMPolicy device file.
+rsmi_status_t
+rsmi_dev_xgmi_plpd_set(uint32_t dv_ind,
+                      uint32_t plpd_id) {
+  TRY
+  std::ostringstream ss;
+  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
+  LOG_TRACE(ss);
+  REQUIRE_ROOT_ACCESS
+  DEVICE_MUTEX
+  GET_DEV_FROM_INDX
+
+  std::string value = "xgmi " + std::to_string(plpd_id);
+  // Translate writeDevInfo()'s int result through ErrnoToRsmiStatus().
+  const int err = dev->writeDevInfo(amd::smi::kDevDPMPolicy, value);
+  return amd::smi::ErrnoToRsmiStatus(err);
+
+  CATCH
+}
+
 rsmi_status_t
 rsmi_dev_dpm_policy_get(uint32_t dv_ind,
                      rsmi_dpm_policy_t* policy) {
@@ -2107,7 +2231,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind,

    if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) {
      ss << __PRETTY_FUNCTION__ << " | ======= end ======="
-          << ", Unexpeced pstat data: the id is negative or too many policies.";
+          << ", Unexpected pstat data: the id is negative or too many policies.";
          LOG_ERROR(ss);
          return RSMI_STATUS_UNEXPECTED_DATA;
    }
@@ -2132,7 +2256,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind,

  if (!see_current) {
      ss << __PRETTY_FUNCTION__ << " | ======= end ======="
-          << ", Unexpeced pstat data: cannot find the current policy.";
+          << ", Unexpected pstat data: cannot find the current policy.";
          LOG_ERROR(ss);
          return RSMI_STATUS_UNEXPECTED_DATA;
  }
@@ -536,8 +536,10 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
  {"rsmi_topo_numa_affinity_get",        {{kDevNumaNodeFName}, {}}},
  {"rsmi_dev_gpu_metrics_info_get",      {{kDevGpuMetricsFName}, {}}},
  {"rsmi_dev_pm_metrics_info_get",       {{kDevPmMetricsFName}, {}}},
-  {"rsmi_dev_dpm_policy_get",           {{kDevDPMPolicyFName}, {}}},
-  {"rsmi_dev_dpm_policy_set",           {{kDevDPMPolicyFName}, {}}},
+  {"rsmi_dev_dpm_policy_get",            {{kDevDPMPolicyFName}, {}}},
+  {"rsmi_dev_dpm_policy_set",            {{kDevDPMPolicyFName}, {}}},
+  {"rsmi_dev_xgmi_plpd_get",             {{kDevDPMPolicyFName}, {}}},
+  {"rsmi_dev_xgmi_plpd_set",             {{kDevDPMPolicyFName}, {}}},
  {"rsmi_dev_reg_table_info_get",        {{kDevRegMetricsFName}, {}}},
  {"rsmi_dev_gpu_reset",                 {{kDevGpuResetFName}, {}}},
  {"rsmi_dev_compute_partition_get",     {{kDevComputePartitionFName}, {}}},
@@ -1369,6 +1369,22 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle,
                    reinterpret_cast<rsmi_dpm_policy_t*>(policy));
 }

+amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle,
+                         uint32_t policy) {
+    AMDSMI_CHECK_INIT();
+    // Forward the requested plpd id to rsmi_dev_xgmi_plpd_set() via rsmi_wrapper.
+    return rsmi_wrapper(rsmi_dev_xgmi_plpd_set, processor_handle,
+                    policy);
+}
+
+amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle,
+                         amdsmi_dpm_policy_t* policy) {
+    AMDSMI_CHECK_INIT();
+    // NOTE(review): the cast assumes amdsmi_dpm_policy_t mirrors rsmi_dpm_policy_t.
+    return rsmi_wrapper(rsmi_dev_xgmi_plpd_get, processor_handle,
+                    reinterpret_cast<rsmi_dpm_policy_t*>(policy));
+}
+
 amdsmi_status_t
 amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle,
                                    uint32_t *num_pages,