Process isolation and clean shader

A few APIs and command line options are added to support process
isolation and clean shader.

Change-Id: I98ad3fc9fc7429799a21798b7fca1c307de7f403
This commit is contained in:
Bill(Shuzhou) Liu
2024-04-08 10:35:24 -05:00
committed by Shuzhou Liu
parent 1ae3a5b6cb
commit 7d2ab7970d
12 changed files with 611 additions and 53 deletions
+8 -5
View File
@@ -148,9 +148,9 @@ Command Modifiers:
```bash
~$ amd-smi static --help
usage: amd-smi static [-h] [-g GPU [GPU ...] | -U CPU [CPU ...]] [-a] [-b] [-V] [-d] [-v]
[-c] [-B] [-r] [-p] [-l] [-u] [-s] [-i] [--json | --csv]
[--file FILE] [--loglevel LEVEL]
usage: amd-smi static [-h] [-g GPU [GPU ...]] [-a] [-b] [-V] [-d] [-v] [-c] [-B] [-r] [-p]
[-l] [-P] [-x] [-R] [-u] [--json | --csv] [--file FILE]
[--loglevel LEVEL]
If no GPU is specified, returns static information for all GPUs on the system.
If no static argument is provided, all static information will be displayed.
@@ -179,6 +179,7 @@ Static Arguments:
-r, --ras Displays RAS features information
-p, --partition Partition information
-l, --limit All limit metric values (i.e. power and thermal limits)
-R, --process-isolation The process isolation status
-u, --numa All numa node information
CPU Arguments:
@@ -474,13 +475,13 @@ Command Modifiers:
```bash
usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]) [-f %]
[-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] [-M PARTITION]
[-o WATTS] [-p POLICY] [--cpu-pwr-limit PWR_LIMIT]
[-o WATTS] [-p POLICY] [-R STATUS] [--cpu-pwr-limit PWR_LIMIT]
[--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH]
[--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM] [--cpu-pwr-eff-mode MODE]
[--cpu-gmi3-link-width MIN_LW MAX_LW] [--cpu-pcie-link-rate LINK_RATE]
[--cpu-df-pstate-range MAX_PSTATE MIN_PSTATE] [--cpu-enable-apb]
[--cpu-disable-apb DF_PSTATE] [--soc-boost-limit BOOST_LIMIT]
[--core-boost-limit BOOST_LIMIT] [--json | --csv] [--file FILE]
[--core-boost-limit BOOST_LIMIT] [-c] [--json | --csv] [--file FILE]
[--loglevel LEVEL]
A GPU must be specified to set a configuration.
@@ -514,6 +515,8 @@ Set Arguments:
-o, --power-cap WATTS Set power capacity limit
-p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id
-x, --xgmi-plpd POLICY_ID Set the GPU XGMI per-link power down policy using policy id
-R, --process-isolation STATUS Enable or disable the GPU process isolation: 0 for disable and 1 for enable.
-c, --clear-sram-data Clear the GPU SRAM data
CPU Arguments:
--cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value.
+89 -37
View File
@@ -245,7 +245,7 @@ class AMDSMICommands():
def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None,
limit=None, driver=None, ras=None, board=None, numa=None, vram=None,
cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None,
policy=None, xgmi_plpd=None):
policy=None, xgmi_plpd=None, process_isolation=None):
"""Get Static information for target gpu
Args:
@@ -270,6 +270,7 @@ class AMDSMICommands():
num_vf (bool, optional): Value override for args.num_vf. Defaults to None.
policy (bool, optional): Value override for args.policy. Defaults to None.
xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None.
process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None.
Returns:
None: Print output via AMDSMILogger to destination
"""
@@ -306,8 +307,10 @@ class AMDSMICommands():
args.policy = policy
if xgmi_plpd:
args.xgmi_plpd = xgmi_plpd
current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd"]
current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd]
if process_isolation:
args.process_isolation = process_isolation
current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd", "process_isolation"]
current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd, args.process_isolation]
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
if numa:
@@ -643,6 +646,16 @@ class AMDSMICommands():
logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['xgmi_plpd'] = policy_info
if 'process_isolation' in current_platform_args:
    if args.process_isolation:
        # Query the current isolation state and map it to a display string;
        # fall back to "N/A" when the library call fails.
        try:
            status = amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu)
            status = "Enabled" if status else "Disabled"
        except amdsmi_exception.AmdSmiLibraryException as e:
            status = "N/A"
            logging.debug("Failed to get process isolation for gpu %s | %s", gpu_id, e.get_error_info())

        static_dict['process_isolation'] = status
if 'numa' in current_platform_args:
if args.numa:
try:
@@ -779,7 +792,7 @@ class AMDSMICommands():
bus=None, vbios=None, limit=None, driver=None, ras=None,
board=None, numa=None, vram=None, cache=None, partition=None,
dfc_ucode=None, fb_info=None, num_vf=None, cpu=None,
interface_ver=None, policy=None, xgmi_plpd = None):
interface_ver=None, policy=None, xgmi_plpd = None, process_isolation=None):
"""Get Static information for target gpu and cpu
Args:
@@ -804,6 +817,7 @@ class AMDSMICommands():
interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None
policy (bool, optional): Value override for args.policy. Defaults to None.
xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None.
process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None.
Raises:
IndexError: Index error if gpu list is empty
@@ -829,7 +843,8 @@ class AMDSMICommands():
gpu_args_enabled = False
gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras",
"board", "numa", "vram", "cache", "partition",
"dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd"]
"dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd",
"process_isolation"]
for attr in gpu_attributes:
if hasattr(args, attr):
if getattr(args, attr):
@@ -859,7 +874,8 @@ class AMDSMICommands():
# xgmi_plpd must be passed explicitly: static_gpu takes it positionally right
# before process_isolation, so omitting it would bind process_isolation to the
# xgmi_plpd parameter.
self.static_gpu(args, multiple_devices, gpu, asic,
                bus, vbios, limit, driver, ras,
                board, numa, vram, cache, partition,
                dfc_ucode, fb_info, num_vf, policy, xgmi_plpd,
                process_isolation)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
if args.cpu == None:
args.cpu = self.cpu_handles
@@ -873,7 +889,8 @@ class AMDSMICommands():
self.static_gpu(args, multiple_devices, gpu, asic,
bus, vbios, limit, driver, ras,
board, numa, vram, cache, partition,
dfc_ucode, fb_info, num_vf, policy, xgmi_plpd)
dfc_ucode, fb_info, num_vf, policy, xgmi_plpd,
process_isolation)
def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True):
@@ -3326,7 +3343,8 @@ class AMDSMICommands():
def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
profile=None, perf_determinism=None, compute_partition=None,
memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None):
memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None,
process_isolation=None, clear_sram_data = None):
"""Issue reset commands to target gpu(s)
Args:
@@ -3342,7 +3360,8 @@ class AMDSMICommands():
power_cap (int, optional): Value override for args.power_cap. Defaults to None.
dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None.
xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None.
process_isolation (int, optional): Value override for args.process_isolation. Defaults to None.
clear_sram_data (int, optional): Value override for args.clear_sram_data. Defaults to None.
Raises:
ValueError: Value error if no gpu value is provided
IndexError: Index error if gpu list is empty
@@ -3371,6 +3390,10 @@ class AMDSMICommands():
args.dpm_policy = dpm_policy
if xgmi_plpd:
args.xgmi_plpd = xgmi_plpd
# 0 is a meaningful override (disable isolation), so compare against None
# instead of relying on truthiness.
if process_isolation is not None:
    args.process_isolation = process_isolation
if clear_sram_data:
    args.clear_sram_data = clear_sram_data
# Handle No GPU passed
if args.gpu == None:
raise ValueError('No GPU provided, specific GPU target(s) are needed')
@@ -3389,9 +3412,11 @@ class AMDSMICommands():
args.compute_partition,
args.memory_partition,
args.perf_determinism is not None,
args.power_cap,
args.dpm_policy,
args.xgmi_plpd]):
args.power_cap is not None,
args.dpm_policy is not None,
args.xgmi_plpd is not None,
args.process_isolation is not None,
args.clear_sram_data]):
command = " ".join(sys.argv[1:])
raise AmdSmiRequiredCommandException(command, self.logger.format)
@@ -3455,25 +3480,6 @@ class AMDSMICommands():
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}")
if args.dpm_policy:
try:
amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}")
if args.xgmi_plpd:
try:
amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.dpm_policy}")
if isinstance(args.power_cap, int):
try:
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
@@ -3499,6 +3505,48 @@ class AMDSMICommands():
if min_power_cap == 0:
min_power_cap = 1
self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap} and {max_power_cap}")
if isinstance(args.dpm_policy, int):
try:
amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}")
if isinstance(args.xgmi_plpd, int):
    try:
        amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd)
    except amdsmi_exception.AmdSmiLibraryException as e:
        if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
            raise PermissionError('Command requires elevation') from e
        raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e
    # Report the XGMI policy id that was just applied, not the DPM policy id.
    self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.xgmi_plpd}")
if isinstance(args.process_isolation, int):
status_string = "Enabled" if args.process_isolation else "Disabled"
result = f"Requested process isolation to {status_string}" # This should not print out
try:
current_status = amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu)
if current_status == args.process_isolation:
result = f"Process isolation is already {status_string}"
else:
amdsmi_interface.amdsmi_set_gpu_process_isolation(args.gpu, args.process_isolation)
result = f"Successfully set process isolation to {status_string}"
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set process isolation to {status_string} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'process_isolation', result)
if args.clear_sram_data:
    try:
        # Only 1 can be used for now.
        amdsmi_interface.amdsmi_set_gpu_clear_sram_data(args.gpu, 1)
        result = 'Successfully clear GPU SRAM data'
    except amdsmi_exception.AmdSmiLibraryException as e:
        if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
            raise PermissionError('Command requires elevation') from e
        # Identify the device with gpu_string, as the other set branches do.
        raise ValueError(f"Unable to clear SRAM data on {gpu_string}") from e
    self.logger.store_output(args.gpu, 'clear_sram_data', result)
if multiple_devices:
self.logger.store_multiple_device_output()
@@ -3513,7 +3561,8 @@ class AMDSMICommands():
cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None,
cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None,
cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None,
soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None):
soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None,
process_isolation=None, clear_sram_data=None):
"""Issue reset commands to target gpu(s)
Args:
@@ -3544,7 +3593,8 @@ class AMDSMICommands():
core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None
dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None.
xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None.
process_isolation (int, optional): Value override for args.process_isolation. Defaults to None.
clear_sram_data (int, optional): Value override for args.clear_sram_data. Defaults to None.
Raises:
ValueError: Value error if no gpu value is provided
IndexError: Index error if gpu list is empty
@@ -3564,7 +3614,8 @@ class AMDSMICommands():
# Check if a GPU argument has been set
gpu_args_enabled = False
gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
"memory_partition", "power_cap", "dpm_policy", "xgmi_plpd"]
"memory_partition", "power_cap", "dpm_policy", "xgmi_plpd", "process_isolation",
"clear_sram_data"]
for attr in gpu_attributes:
if hasattr(args, attr):
if getattr(args, attr) is not None:
@@ -3620,7 +3671,8 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_ouput()
self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
profile, perf_determinism, compute_partition,
memory_partition, power_cap, dpm_policy, xgmi_plpd)
memory_partition, power_cap, dpm_policy, xgmi_plpd,
process_isolation, clear_sram_data)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
if args.cpu == None and args.core == None:
raise ValueError('No CPU or CORE provided, specific target(s) are needed')
@@ -3639,7 +3691,8 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_ouput()
self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
profile, perf_determinism, compute_partition,
memory_partition, power_cap, dpm_policy, xgmi_plpd)
memory_partition, power_cap, dpm_policy, xgmi_plpd,
process_isolation, clear_sram_data)
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
@@ -3660,7 +3713,6 @@ class AMDSMICommands():
compute_partition (bool, optional): Value override for args.compute_partition. Defaults to None.
memory_partition (bool, optional): Value override for args.memory_partition. Defaults to None.
power_cap (int, optional): Value override for args.power_cap. Defaults to None.
Raises:
ValueError: Value error if no gpu value is provided
IndexError: Index error if gpu list is empty
+8 -2
View File
@@ -545,6 +545,7 @@ class AMDSMIParser(argparse.ArgumentParser):
board_help = "All board information"
dpm_policy_help = "The available DPM policy"
xgmi_plpd_help = "The available XGMI per-link power down policy"
process_isolation_help = "The process isolation status"
# Options arguments help text for Hypervisors and Baremetal
ras_help = "Displays RAS features information"
@@ -586,6 +587,7 @@ class AMDSMIParser(argparse.ArgumentParser):
static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help)
static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help)
static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help)
static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help)
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help)
@@ -967,8 +969,9 @@ class AMDSMIParser(argparse.ArgumentParser):
set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}"
set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}"
set_power_cap_help = "Set power capacity limit"
set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n"
set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id\n"
set_dpm_policy_help = "Set the GPU DPM policy using policy id\n"
set_xgmi_plpd_help = "Set the GPU XGMI per-link power down policy using policy id\n"
set_process_isolation_help = "Enable or disable the GPU process isolation: 0 for disable and 1 for enable.\n"
# Help text for CPU set options
set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value."
@@ -982,6 +985,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_cpu_enable_apb_help = "Enables the DF p-state performance boost algorithm"
set_cpu_disable_apb_help = "Disables the DF p-state performance boost algorithm. Input parameter is DFPstate (0-3)"
set_soc_boost_limit_help = "Sets the boost limit for the given socket. Input parameter is socket BOOST_LIMIT value"
# Help text for the GPU clear-SRAM option; a plain string, since it has no placeholders.
run_gpu_clear_sram_data_help = "Clear the GPU SRAM data\n"
# Help text for CPU Core set options
set_core_boost_limit_help = "Sets the boost limit for the given core. Input parameter is core BOOST_LIMIT value"
@@ -1006,6 +1010,8 @@ class AMDSMIParser(argparse.ArgumentParser):
set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS')
set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID')
set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID')
set_value_parser.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=self._not_negative_int, required=False, help=set_process_isolation_help, metavar='STATUS')
set_value_parser.add_argument('-c', '--clear-sram-data', action='store_true', required=False, help=run_gpu_clear_sram_data_help)
if self.helpers.is_amd_hsmp_initialized():
# Optional CPU Args
+62
View File
@@ -3455,6 +3455,68 @@ amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle,
amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle,
uint32_t plpd_id);
/**
 *  @brief Get the status of the Process Isolation
 *
 *  @platform{gpu_bm_linux} @platform{guest_1vf}
 *
 *  @details Given a processor handle @p processor_handle, this function will write
 *  the current process isolation status to @p pisolate. A value of 0 means process
 *  isolation is disabled, and a value of 1 means it is enabled.
 *
 *  @param[in] processor_handle a processor handle
 *
 *  @param[in, out] pisolate the process isolation status.
 *  If this parameter is nullptr, this function will return
 *  ::AMDSMI_STATUS_INVAL
 *
 *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 */
amdsmi_status_t amdsmi_get_gpu_process_isolation(amdsmi_processor_handle processor_handle,
uint32_t* pisolate);
/**
 *  @brief Enable/disable the system Process Isolation
 *
 *  @platform{gpu_bm_linux} @platform{guest_1vf}
 *
 *  @details Given a processor handle @p processor_handle and a process isolation
 *  flag @p pisolate, this function will set the Process Isolation for this processor.
 *  A value of 0 disables process isolation, and a value of 1 enables it.
 *
 *  @note This function requires root access
 *
 *  @param[in] processor_handle a processor handle
 *
 *  @param[in] pisolate the process isolation status to set.
 *
 *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 */
amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle processor_handle,
uint32_t pisolate);
/**
 *  @brief Clear the GPU SRAM data
 *
 *  @platform{gpu_bm_linux} @platform{guest_1vf}
 *
 *  @details Given a processor handle @p processor_handle and a clean flag @p sclean,
 *  this function will clear the SRAM data of this processor. This can be called between
 *  user logins to prevent information leaks.
 *
 *  @note This function requires root access
 *
 *  @param[in] processor_handle a processor handle
 *
 *  @param[in] sclean the clean flag. Only 1 takes effect; other values
 *  are reserved for future use.
 *
 *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 */
amdsmi_status_t amdsmi_set_gpu_clear_sram_data(amdsmi_processor_handle processor_handle,
uint32_t sclean);
/** @} End PerfCont */
/*****************************************************************************/
+161
View File
@@ -1963,6 +1963,98 @@ except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_process_isolation
Description: Get the status of the Process Isolation
Input parameters:
* `processor_handle` handle for the given device
Output: integer corresponding to isolation_status; 0 - disabled, 1 - enabled
Exceptions that can be thrown by `amdsmi_get_gpu_process_isolation` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
isolate = amdsmi_get_gpu_process_isolation(device)
print("Process Isolation Status: ", isolate)
except AmdSmiException as e:
print(e)
```
### amdsmi_set_gpu_process_isolation
Description: Enable/disable the system Process Isolation for the given device handle.
Input parameters:
* `processor_handle` handle for the given device
* `pisolate` the process isolation status to set: 0 disables process isolation and 1 enables it.
Output: None
Exceptions that can be thrown by `amdsmi_set_gpu_process_isolation` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
amdsmi_set_gpu_process_isolation(device, 1)
except AmdSmiException as e:
print(e)
```
### amdsmi_set_gpu_clear_sram_data
Description: Clear the SRAM data of the given device. This can be called between user logins to prevent information leaks.
Input parameters:
* `processor_handle` handle for the given device
* `sclean` the clean flag. Only 1 takes effect; other values are reserved for future use.
Output: None
Exceptions that can be thrown by `amdsmi_set_gpu_clear_sram_data` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
amdsmi_set_gpu_clear_sram_data(device, 1)
except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_overdrive_level
Description: Get the overdrive percent associated with the device with provided
@@ -2602,6 +2694,75 @@ except AmdSmiException as e:
print(e)
```
### amdsmi_get_dpm_policy
Description: Get dpm policy information.
Input parameters:
* `processor_handle` handle for the given device
Output: Dictionary with fields
Field | Description
---|---
`num_supported` | total number of supported policies
`current_id` | current policy id
`policies` | list of dictionaries containing possible policies
Exceptions that can be thrown by `amdsmi_get_dpm_policy` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
dpm_policies = amdsmi_get_dpm_policy(device)
print(dpm_policies)
except AmdSmiException as e:
print(e)
```
### amdsmi_set_dpm_policy
Description: Set the dpm policy to corresponding policy_id. Typically following: 0(default),1,2,3
Input parameters:
* `processor_handle` handle for the given device
* `policy_id` the policy id to set.
Output: None
Exceptions that can be thrown by `amdsmi_set_dpm_policy` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
amdsmi_set_dpm_policy(device, 0)
except AmdSmiException as e:
print(e)
```
### amdsmi_set_xgmi_plpd
Description: Set the xgmi per-link power down policy parameter for the processor
+55 -1
View File
@@ -2734,6 +2734,7 @@ def amdsmi_set_clk_freq(
)
)
def amdsmi_set_dpm_policy(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
policy_id: int,
@@ -2748,6 +2749,7 @@ def amdsmi_set_dpm_policy(
)
)
def amdsmi_set_xgmi_plpd(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
policy_id: int,
@@ -2762,6 +2764,37 @@ def amdsmi_set_xgmi_plpd(
)
)
def amdsmi_set_gpu_process_isolation(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
    pisolate: int,
):
    """Set the process isolation status of a processor.

    Args:
        processor_handle: handle of the target device.
        pisolate: status value forwarded unchanged to the library call.

    Raises:
        AmdSmiParameterException: if ``processor_handle`` is not an
            ``amdsmi_wrapper.amdsmi_processor_handle``.

    The status code returned by the library is handed to ``_check_res``.
    """
    handle_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, handle_type):
        raise AmdSmiParameterException(processor_handle, handle_type)

    ret = amdsmi_wrapper.amdsmi_set_gpu_process_isolation(processor_handle, pisolate)
    _check_res(ret)
def amdsmi_set_gpu_clear_sram_data(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
    sclean: int,
):
    """Request that the SRAM data of a processor be cleared.

    Args:
        processor_handle: handle of the target device.
        sclean: clean flag forwarded unchanged to the library call.

    Raises:
        AmdSmiParameterException: if ``processor_handle`` is not an
            ``amdsmi_wrapper.amdsmi_processor_handle``.

    The status code returned by the library is handed to ``_check_res``.
    """
    handle_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, handle_type):
        raise AmdSmiParameterException(processor_handle, handle_type)

    ret = amdsmi_wrapper.amdsmi_set_gpu_clear_sram_data(processor_handle, sclean)
    _check_res(ret)
def amdsmi_set_gpu_overdrive_level(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int
):
@@ -2793,6 +2826,7 @@ def amdsmi_get_gpu_bdf_id(processor_handle: amdsmi_wrapper.amdsmi_processor_hand
return bdfid.value
def amdsmi_set_gpu_pci_bandwidth(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle, bitmask: int
) -> None:
@@ -3089,7 +3123,6 @@ def amdsmi_set_gpu_od_volt_info(
)
def amdsmi_get_gpu_fan_rpms(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle, sensor_idx: int
) -> int:
@@ -3320,6 +3353,7 @@ def amdsmi_get_clk_freq(
"frequency": list(freq.frequency)[: freq.num_supported - 1],
}
def amdsmi_get_dpm_policy(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
@@ -3351,6 +3385,7 @@ def amdsmi_get_dpm_policy(
"policies": polices,
}
def amdsmi_get_xgmi_plpd(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
@@ -3382,6 +3417,25 @@ def amdsmi_get_xgmi_plpd(
"plpds": polices,
}
def amdsmi_get_gpu_process_isolation(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> int:
    """Read the process isolation status of a processor.

    Args:
        processor_handle: handle of the target device.

    Returns:
        The integer the library wrote into the ``uint32`` output argument.

    Raises:
        AmdSmiParameterException: if ``processor_handle`` is not an
            ``amdsmi_wrapper.amdsmi_processor_handle``.

    The status code returned by the library is handed to ``_check_res``
    before the value is read.
    """
    handle_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, handle_type):
        raise AmdSmiParameterException(processor_handle, handle_type)

    # Output buffer the library fills in through a by-reference argument.
    isolation_status = ctypes.c_uint32()
    ret = amdsmi_wrapper.amdsmi_get_gpu_process_isolation(
        processor_handle, ctypes.byref(isolation_status)
    )
    _check_res(ret)
    return isolation_status.value
def amdsmi_get_gpu_od_volt_info(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
+17 -7
View File
@@ -2076,6 +2076,15 @@ amdsmi_get_xgmi_plpd.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_
amdsmi_set_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_set_xgmi_plpd
amdsmi_set_xgmi_plpd.restype = amdsmi_status_t
amdsmi_set_xgmi_plpd.argtypes = [amdsmi_processor_handle, uint32_t]
# Binding for amdsmi_get_gpu_process_isolation from libamd_smi.so:
# takes (processor handle, pointer to uint32) and returns amdsmi_status_t.
amdsmi_get_gpu_process_isolation = _libraries['libamd_smi.so'].amdsmi_get_gpu_process_isolation
amdsmi_get_gpu_process_isolation.restype = amdsmi_status_t
amdsmi_get_gpu_process_isolation.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32)]
# Binding for amdsmi_set_gpu_process_isolation: takes (processor handle, uint32_t).
amdsmi_set_gpu_process_isolation = _libraries['libamd_smi.so'].amdsmi_set_gpu_process_isolation
amdsmi_set_gpu_process_isolation.restype = amdsmi_status_t
amdsmi_set_gpu_process_isolation.argtypes = [amdsmi_processor_handle, uint32_t]
# Binding for amdsmi_set_gpu_clear_sram_data: takes (processor handle, uint32_t).
amdsmi_set_gpu_clear_sram_data = _libraries['libamd_smi.so'].amdsmi_set_gpu_clear_sram_data
amdsmi_set_gpu_clear_sram_data.restype = amdsmi_status_t
amdsmi_set_gpu_clear_sram_data.argtypes = [amdsmi_processor_handle, uint32_t]
amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version
amdsmi_get_lib_version.restype = amdsmi_status_t
amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)]
@@ -2589,7 +2598,7 @@ __all__ = \
'amdsmi_get_gpu_pci_throughput', 'amdsmi_get_gpu_perf_level',
'amdsmi_get_gpu_pm_metrics_info',
'amdsmi_get_gpu_power_profile_presets',
'amdsmi_get_gpu_process_list',
'amdsmi_get_gpu_process_isolation', 'amdsmi_get_gpu_process_list',
'amdsmi_get_gpu_ras_block_features_enabled',
'amdsmi_get_gpu_ras_feature_info',
'amdsmi_get_gpu_reg_table_info', 'amdsmi_get_gpu_revision',
@@ -2646,18 +2655,19 @@ __all__ = \
'amdsmi_set_cpu_socket_boostlimit',
'amdsmi_set_cpu_socket_lclk_dpm_level',
'amdsmi_set_cpu_socket_power_cap', 'amdsmi_set_cpu_xgmi_width',
'amdsmi_set_dpm_policy', 'amdsmi_set_gpu_clk_range',
'amdsmi_set_gpu_compute_partition',
'amdsmi_set_dpm_policy', 'amdsmi_set_gpu_clear_sram_data',
'amdsmi_set_gpu_clk_range', 'amdsmi_set_gpu_compute_partition',
'amdsmi_set_gpu_event_notification_mask',
'amdsmi_set_gpu_fan_speed', 'amdsmi_set_gpu_memory_partition',
'amdsmi_set_gpu_od_clk_info', 'amdsmi_set_gpu_od_volt_info',
'amdsmi_set_gpu_overdrive_level', 'amdsmi_set_gpu_pci_bandwidth',
'amdsmi_set_gpu_perf_determinism_mode',
'amdsmi_set_gpu_perf_level', 'amdsmi_set_gpu_power_profile',
'amdsmi_set_power_cap', 'amdsmi_set_xgmi_plpd',
'amdsmi_shut_down', 'amdsmi_smu_fw_version_t',
'amdsmi_socket_handle', 'amdsmi_status_code_to_string',
'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification',
'amdsmi_set_gpu_process_isolation', 'amdsmi_set_power_cap',
'amdsmi_set_xgmi_plpd', 'amdsmi_shut_down',
'amdsmi_smu_fw_version_t', 'amdsmi_socket_handle',
'amdsmi_status_code_to_string', 'amdsmi_status_t',
'amdsmi_stop_gpu_event_notification',
'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t',
'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type',
'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number',
+56 -1
View File
@@ -3362,7 +3362,7 @@ rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind,
*
* @note This function requires root access
*
* @param[in] processor_handle a processor handle
* @param[in] dv_ind a device index
*
* @param[in] policy_id the dpm policy will be modified
*
@@ -3410,6 +3410,61 @@ rsmi_status_t rsmi_dev_xgmi_plpd_get(uint32_t dv_ind,
*/
rsmi_status_t rsmi_dev_xgmi_plpd_set(uint32_t dv_ind,
uint32_t plpd_id);
/**
* @brief Get the process isolation status of a device
*
* @details Given a device index @p dv_ind, this function writes the current
* process isolation status to @p pisolate. A value of 0 means process isolation
* is disabled, and a value of 1 means process isolation is enabled.
*
* @param[in] dv_ind a device index
*
* @param[in, out] pisolate pointer to the location that receives the process
* isolation status.
* If this parameter is nullptr, this function will return
* ::RSMI_STATUS_INVAL
*
* @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail
* @retval ::RSMI_STATUS_NOT_SUPPORTED the process isolation sysfs file could not be read
* @retval ::RSMI_STATUS_UNEXPECTED_DATA the entry for this device's partition could not be parsed
* @retval ::RSMI_STATUS_NOT_FOUND no entry for this device's partition was found
*/
rsmi_status_t rsmi_dev_process_isolation_get(uint32_t dv_ind,
uint32_t* pisolate);
/**
* @brief Enable or disable process isolation for a device
*
* @details Given a device index @p dv_ind and a process isolation flag
* @p pisolate, this function sets the process isolation status for this device.
* A value of 0 disables process isolation, and a value of 1 enables it.
*
* @note This function requires root access
*
* @param[in] dv_ind a device index
*
* @param[in] pisolate the process isolation status to set.
*
* @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail
*/
rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind,
uint32_t pisolate);
/**
* @brief Clear the GPU SRAM data
*
* @details Given a device index @p dv_ind, this function will clear the
* GPU SRAM data of this device. It can be called between user logins to
* prevent information leaks.
*
* @note This function requires root access
*
* @param[in] dv_ind a device index
*
* @param[in] sclean the clean flag. Only 1 takes effect; other values
* are reserved for future use.
*
* @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail
*/
rsmi_status_t rsmi_dev_gpu_clear_sram_data(uint32_t dv_ind, uint32_t sclean);
/** @} */ // end of PerfCont
/*****************************************************************************/
@@ -101,6 +101,8 @@ enum DevKFDNodePropTypes {
enum DevInfoTypes {
kDevPerfLevel,
kDevProcessIsolation,
kDevShaderClean,
kDevOverDriveLevel,
kDevMemOverDriveLevel,
kDevDevID,
@@ -222,6 +224,7 @@ class Device {
void set_drm_render_minor(uint32_t minor) {drm_render_minor_ = minor;}
static rsmi_dev_perf_level perfLvlStrToEnum(std::string s);
uint64_t bdfid(void) const {return bdfid_;}
int get_partition_id() const {return (bdfid_ >> 28) & 0xf; } // location_id[31:28]
void set_bdfid(uint64_t val) {bdfid_ = val;}
pthread_mutex_t *mutex(void) {return mutex_.ptr;}
evt::dev_evt_grp_set_t* supported_event_groups(void) {
+115
View File
@@ -1974,6 +1974,121 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind,
}
/**
 * Read the process isolation flag that applies to device @p dv_ind.
 *
 * The enforce_isolation sysfs file holds one "<partition_id> <enable_flag>"
 * line per partition. This function looks up the line whose partition id
 * matches the device's own partition id and stores its flag in @p pisolate.
 *
 * @param[in]  dv_ind   device index
 * @param[out] pisolate receives the flag of the matching partition; validated
 *                      by CHK_SUPPORT_NAME_ONLY before use
 *
 * @return RSMI_STATUS_SUCCESS when a matching, well-formed line was found;
 *         RSMI_STATUS_NOT_SUPPORTED when the sysfs file could not be read;
 *         RSMI_STATUS_UNEXPECTED_DATA when the flag of the matching line is
 *         not an integer; RSMI_STATUS_NOT_FOUND when no line matches the
 *         device's partition id; otherwise the status from GetDevValueVec().
 */
rsmi_status_t rsmi_dev_process_isolation_get(uint32_t dv_ind,
                                             uint32_t* pisolate) {
  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << "| ======= start ======= dev_ind:"
     << dv_ind;
  LOG_TRACE(ss);

  CHK_SUPPORT_NAME_ONLY(pisolate)

  // the enforce_isolation sysfs is in this format <partition_id, enable_flag>
  // Get the partition_id. For SPX, the partition_id will be 0.
  const int partition_id = dev->get_partition_id();
  DEVICE_MUTEX

  std::vector<std::string> val_vec;
  rsmi_status_t ret = GetDevValueVec(amd::smi::kDevProcessIsolation, dv_ind, &val_vec);
  if (ret == RSMI_STATUS_FILE_ERROR) {
    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
       << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR "
       << "-> reporting RSMI_STATUS_NOT_SUPPORTED";
    LOG_ERROR(ss);
    return RSMI_STATUS_NOT_SUPPORTED;
  }
  if (ret != RSMI_STATUS_SUCCESS) {
    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
       << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS"
       << " -> reporting " << amd::smi::getRSMIStatusString(ret);
    LOG_ERROR(ss);
    return ret;
  }

  /*
  For TPX system where partition0 is enabled, but partition1 and partition2 are disabled,
  it will be in this format:
  0 1
  1 0
  2 0
  */
  for (auto& raw_line : val_vec) {
    const auto current_line = amd::smi::trim(raw_line);

    // Split on any run of whitespace so repeated spaces or tabs between the
    // two fields do not produce empty tokens and hide a valid line.
    std::vector<std::string> tokens;
    std::istringstream line_stream(current_line);
    std::string token;
    while (line_stream >> token) {
      tokens.push_back(token);
    }

    // Lines that are not "<integer> <integer>" or belong to another
    // partition are skipped.
    if (tokens.size() != 2) {
      continue;
    }
    int cur_part_id = 0;
    if (!amd::smi::stringToInteger(tokens[0], cur_part_id)) {
      continue;
    }
    if (cur_part_id != partition_id) {
      continue;
    }

    int isolate_status = 0;
    if (!amd::smi::stringToInteger(tokens[1], isolate_status)) {
      ss << __PRETTY_FUNCTION__ << " | ======= end ======="
         << ", the sysfs line " << current_line
         << " should be in <integer> <integer> format";
      LOG_ERROR(ss);
      return RSMI_STATUS_UNEXPECTED_DATA;
    }
    *pisolate = static_cast<uint32_t>(isolate_status);
    return RSMI_STATUS_SUCCESS;
  }  // end for

  ss << __PRETTY_FUNCTION__ << " | ======= end ======="
     << ", cannot find the partition_id " << partition_id
     << " from sysfs";
  LOG_ERROR(ss);
  return RSMI_STATUS_NOT_FOUND;
}
/**
 * Set the process isolation flag for the partition of device @p dv_ind.
 *
 * Writes "<partition_id> <pisolate>" to the device's process isolation sysfs
 * entry, where partition_id is taken from the device itself.
 *
 * @param[in] dv_ind   device index
 * @param[in] pisolate flag value to write; it is passed through unmodified
 *
 * @return the write result translated by amd::smi::ErrnoToRsmiStatus(), or an
 *         earlier status returned by the access/lookup macros.
 */
rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind,
                                             uint32_t pisolate) {
  TRY
  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);
  REQUIRE_ROOT_ACCESS
  DEVICE_MUTEX
  GET_DEV_FROM_INDX

  // the enforce_isolation sysfs is in this format <partition_id, enable_flag>
  // The smi will always pass partition_id. For SPX, the partition_id will be 0.
  const int partition_id = dev->get_partition_id();
  const std::string value =
      std::to_string(partition_id) + " " + std::to_string(pisolate);

  // writeDevInfo() reports an errno-style int; convert it for the caller.
  const int write_err = dev->writeDevInfo(amd::smi::kDevProcessIsolation, value);
  return amd::smi::ErrnoToRsmiStatus(write_err);

  CATCH
}
/**
 * Request a clear of the GPU SRAM data on device @p dv_ind.
 *
 * Writes the decimal value of @p sclean to the device's shader-clean sysfs
 * entry (kDevShaderClean).
 *
 * @param[in] dv_ind device index
 * @param[in] sclean value to write; it is passed through unmodified
 *
 * @return the write result translated by amd::smi::ErrnoToRsmiStatus(), or an
 *         earlier status returned by the access/lookup macros.
 *
 * NOTE(review): the function-dependency table registers the shader-clean
 * sysfs file under the name "rsmi_dev_gpu_shader_clean", which does not match
 * this function's name - confirm whether that entry should be renamed.
 */
rsmi_status_t rsmi_dev_gpu_clear_sram_data(uint32_t dv_ind,
                                           uint32_t sclean) {
  TRY
  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);
  REQUIRE_ROOT_ACCESS
  DEVICE_MUTEX
  GET_DEV_FROM_INDX

  const std::string value = std::to_string(sclean);

  // writeDevInfo() reports an errno-style int; convert it for the caller.
  const int write_err = dev->writeDevInfo(amd::smi::kDevShaderClean, value);
  return amd::smi::ErrnoToRsmiStatus(write_err);

  CATCH
}
rsmi_status_t
rsmi_dev_dpm_policy_set(uint32_t dv_ind,
uint32_t policy_id) {
+13
View File
@@ -82,6 +82,8 @@ static const char *kDevPCieVendorIDFName = "vendor";
// Device sysfs file names
static const char *kDevPerfLevelFName = "power_dpm_force_performance_level";
// Process isolation: one "<partition_id> <enable_flag>" line per partition.
static const char *kDevProcessIsolationFName = "enforce_isolation";
// Written by rsmi_dev_gpu_clear_sram_data() to request a GPU SRAM clear.
static const char *kDevShaderCleanFName = "run_cleaner_shader";
static const char *kDevDevProdNameFName = "product_name";
static const char *kDevDevProdNumFName = "product_number";
static const char *kDevDevIDFName = "device";
@@ -317,6 +319,8 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevGpuMetrics, kDevGpuMetricsFName},
{kDevPmMetrics, kDevPmMetricsFName},
{kDevDPMPolicy, kDevDPMPolicyFName},
{kDevProcessIsolation, kDevProcessIsolationFName},
{kDevShaderClean, kDevShaderCleanFName},
{kDevRegMetrics, kDevRegMetricsFName},
{kDevGpuReset, kDevGpuResetFName},
{kDevAvailableComputePartition, kDevAvailableComputePartitionFName},
@@ -475,6 +479,8 @@ Device::devInfoTypesStrings = {
{kDevMemoryPartition, "kDevMemoryPartition"},
{kDevPCieVendorID, "kDevPCieVendorID"},
{kDevDPMPolicy, "kDevDPMPolicy"},
{kDevProcessIsolation, "kDevProcessIsolation"},
{kDevShaderClean, "kDevShaderClean"},
};
static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
@@ -516,6 +522,9 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
{"rsmi_dev_perf_level_set", {{kDevPerfLevelFName}, {}}},
{"rsmi_dev_perf_level_set_v1", {{kDevPerfLevelFName}, {}}},
{"rsmi_dev_perf_level_get", {{kDevPerfLevelFName}, {}}},
{"rsmi_dev_process_isolation_set", {{kDevProcessIsolationFName}, {}}},
{"rsmi_dev_process_isolation_get", {{kDevProcessIsolationFName}, {}}},
{"rsmi_dev_gpu_shader_clean", {{kDevShaderCleanFName}, {}}},
{"rsmi_perf_determinism_mode_set", {{kDevPerfLevelFName,
kDevPowerODVoltageFName}, {}}},
{"rsmi_dev_overdrive_level_set", {{kDevOverDriveLevelFName}, {}}},
@@ -939,6 +948,8 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) {
sysfs_path += kDevAttribNameMap.at(type);
switch (type) {
case kDevGPUMClk:
case kDevProcessIsolation:
case kDevShaderClean:
case kDevDCEFClk:
case kDevFClk:
case kDevGPUSClk:
@@ -1212,6 +1223,7 @@ int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
switch (type) {
case kDevGPUMClk:
case kDevProcessIsolation:
case kDevGPUSClk:
case kDevDCEFClk:
case kDevFClk:
@@ -1279,6 +1291,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) {
case kDevMemoryPartition:
case kDevNumaNode:
case kDevXGMIPhysicalID:
case kDevProcessIsolation:
return readDevInfoStr(type, val);
break;
+24
View File
@@ -1385,6 +1385,30 @@ amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle,
reinterpret_cast<rsmi_dpm_policy_t*>(policy));
}
/**
 * Query the process isolation status of the given processor.
 *
 * Runs AMDSMI_CHECK_INIT() and then forwards @p processor_handle and
 * @p pisolate to rsmi_dev_process_isolation_get through rsmi_wrapper.
 *
 * @param[in]  processor_handle processor to query
 * @param[out] pisolate         receives the process isolation status
 *
 * @return the status produced by rsmi_wrapper
 */
amdsmi_status_t amdsmi_get_gpu_process_isolation(
    amdsmi_processor_handle processor_handle, uint32_t* pisolate) {
  AMDSMI_CHECK_INIT();

  const amdsmi_status_t status = rsmi_wrapper(
      rsmi_dev_process_isolation_get, processor_handle, pisolate);
  return status;
}
/**
 * Set the process isolation status of the given processor.
 *
 * Runs AMDSMI_CHECK_INIT() and then forwards @p processor_handle and
 * @p pisolate to rsmi_dev_process_isolation_set through rsmi_wrapper.
 *
 * @param[in] processor_handle processor to modify
 * @param[in] pisolate         process isolation status to set
 *
 * @return the status produced by rsmi_wrapper
 */
amdsmi_status_t amdsmi_set_gpu_process_isolation(
    amdsmi_processor_handle processor_handle, uint32_t pisolate) {
  AMDSMI_CHECK_INIT();

  const amdsmi_status_t status = rsmi_wrapper(
      rsmi_dev_process_isolation_set, processor_handle, pisolate);
  return status;
}
/**
 * Request a clear of the GPU SRAM data on the given processor.
 *
 * Runs AMDSMI_CHECK_INIT() and then forwards @p processor_handle and
 * @p sclean to rsmi_dev_gpu_clear_sram_data through rsmi_wrapper.
 *
 * @param[in] processor_handle processor to act on
 * @param[in] sclean           clean flag passed to the underlying call
 *
 * @return the status produced by rsmi_wrapper
 */
amdsmi_status_t amdsmi_set_gpu_clear_sram_data(
    amdsmi_processor_handle processor_handle, uint32_t sclean) {
  AMDSMI_CHECK_INIT();

  const amdsmi_status_t status = rsmi_wrapper(
      rsmi_dev_gpu_clear_sram_data, processor_handle, sclean);
  return status;
}
amdsmi_status_t
amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle,
uint32_t *num_pages,