From 7d2ab7970d77c3031be042f832ac704ade5cade6 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Mon, 8 Apr 2024 10:35:24 -0500 Subject: [PATCH] Process isolation and clean shader A few APIs and command line options are added to support process isolation and clean shader. Change-Id: I98ad3fc9fc7429799a21798b7fca1c307de7f403 --- amdsmi_cli/README.md | 13 +- amdsmi_cli/amdsmi_commands.py | 126 ++++++++++----- amdsmi_cli/amdsmi_parser.py | 10 +- include/amd_smi/amdsmi.h | 62 ++++++++ py-interface/README.md | 161 ++++++++++++++++++++ py-interface/amdsmi_interface.py | 56 ++++++- py-interface/amdsmi_wrapper.py | 24 ++- rocm_smi/include/rocm_smi/rocm_smi.h | 57 ++++++- rocm_smi/include/rocm_smi/rocm_smi_device.h | 3 + rocm_smi/src/rocm_smi.cc | 115 ++++++++++++++ rocm_smi/src/rocm_smi_device.cc | 13 ++ src/amd_smi/amd_smi.cc | 24 +++ 12 files changed, 611 insertions(+), 53 deletions(-) diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 06e891475c..27e54e04c2 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -148,9 +148,9 @@ Command Modifiers: ```bash ~$ amd-smi static --help -usage: amd-smi static [-h] [-g GPU [GPU ...] | -U CPU [CPU ...]] [-a] [-b] [-V] [-d] [-v] - [-c] [-B] [-r] [-p] [-l] [-u] [-s] [-i] [--json | --csv] - [--file FILE] [--loglevel LEVEL] +usage: amd-smi static [-h] [-g GPU [GPU ...]] [-a] [-b] [-V] [-d] [-v] [-c] [-B] [-r] [-p] + [-l] [-P] [-x] [-s] [-u] [--json | --csv] [--file FILE] + [--loglevel LEVEL] If no GPU is specified, returns static information for all GPUs on the system. If no static argument is provided, all static information will be displayed. @@ -179,6 +179,7 @@ Static Arguments: -r, --ras Displays RAS features information -p, --partition Partition information -l, --limit All limit metric values (i.e. 
power and thermal limits) + -R, --process-isolation The process isolation status -u, --numa All numa node information CPU Arguments: @@ -474,13 +475,13 @@ Command Modifiers: ```bash usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]) [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] [-M PARTITION] - [-o WATTS] [-p POLICY] [--cpu-pwr-limit PWR_LIMIT] + [-o WATTS] [-p POLICY] [-R STATUS] [--cpu-pwr-limit PWR_LIMIT] [--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH] [--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM] [--cpu-pwr-eff-mode MODE] [--cpu-gmi3-link-width MIN_LW MAX_LW] [--cpu-pcie-link-rate LINK_RATE] [--cpu-df-pstate-range MAX_PSTATE MIN_PSTATE] [--cpu-enable-apb] [--cpu-disable-apb DF_PSTATE] [--soc-boost-limit BOOST_LIMIT] - [--core-boost-limit BOOST_LIMIT] [--json | --csv] [--file FILE] + [--core-boost-limit BOOST_LIMIT] [-c] [--json | --csv] [--file FILE] [--loglevel LEVEL] A GPU must be specified to set a configuration. @@ -514,6 +515,8 @@ Set Arguments: -o, --power-cap WATTS Set power capacity limit -p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id -x, --xgmi-plpd POLICY_ID Set the GPU XGMI per-link power down policy using policy id + -R, --process-isolation STATUS Enable or disable the GPU process isolation: 0 for disable and 1 for enable. + -c, --clear-sram-data Clear the GPU SRAM data CPU Arguments: --cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value. 
diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 17427ff34e..9a4c468686 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -245,7 +245,7 @@ class AMDSMICommands(): def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, - policy=None, xgmi_plpd=None): + policy=None, xgmi_plpd=None, process_isolation=None): """Get Static information for target gpu Args: @@ -270,6 +270,7 @@ class AMDSMICommands(): num_vf (bool, optional): Value override for args.num_vf. Defaults to None. policy (bool, optional): Value override for args.policy. Defaults to None. xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. + process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None. Returns: None: Print output via AMDSMILogger to destination """ @@ -306,8 +307,10 @@ class AMDSMICommands(): args.policy = policy if xgmi_plpd: args.xgmi_plpd = xgmi_plpd - current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd"] - current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd] + if process_isolation: + args.process_isolation = process_isolation + current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd", "process_isolation"] + current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd, args.process_isolation] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -643,6 +646,16 @@ class AMDSMICommands(): logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['xgmi_plpd'] = policy_info + if 'process_isolation' in current_platform_args: + if args.process_isolation: + try: + status = 
amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu) + status = "Enabled" if status else "Disabled" + except amdsmi_exception.AmdSmiLibraryException as e: + status = "N/A" + logging.debug("Failed to get process isolation info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['process_isolation'] = status if 'numa' in current_platform_args: if args.numa: try: @@ -779,7 +792,7 @@ class AMDSMICommands(): bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, cpu=None, - interface_ver=None, policy=None, xgmi_plpd = None): + interface_ver=None, policy=None, xgmi_plpd = None, process_isolation=None): """Get Static information for target gpu and cpu Args: @@ -804,6 +817,7 @@ class AMDSMICommands(): interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None policy (bool, optional): Value override for args.policy. Defaults to None. xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. + process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None. 
Raises: IndexError: Index error if gpu list is empty @@ -829,7 +843,8 @@ class AMDSMICommands(): gpu_args_enabled = False gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras", "board", "numa", "vram", "cache", "partition", - "dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd"] + "dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd", + "process_isolation"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr): @@ -859,7 +874,8 @@ class AMDSMICommands(): self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf, policy) + dfc_ucode, fb_info, num_vf, policy, xgmi_plpd, + process_isolation) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None: args.cpu = self.cpu_handles @@ -873,7 +889,8 @@ class AMDSMICommands(): self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf, policy, xgmi_plpd) + dfc_ucode, fb_info, num_vf, policy, xgmi_plpd, + process_isolation) def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -3326,7 +3343,8 @@ class AMDSMICommands(): def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, - memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None): + memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None, + process_isolation=None, clear_sram_data = None): """Issue reset commands to target gpu(s) Args: @@ -3342,7 +3360,8 @@ class AMDSMICommands(): power_cap (int, optional): Value override for args.power_cap. Defaults to None. dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. 
- + process_isolation (int, optional): Value override for args.process_isolation. Defaults to None. + clear_sram_data (int, optional): Value override for args.clear_sram_data. Defaults to None. Raises: ValueError: Value error if no gpu value is provided IndexError: Index error if gpu list is empty @@ -3371,6 +3390,10 @@ class AMDSMICommands(): args.dpm_policy = dpm_policy if xgmi_plpd: args.xgmi_plpd = xgmi_plpd + if process_isolation: + args.process_isolation = process_isolation + if clear_sram_data: + args.clear_sram_data = clear_sram_data # Handle No GPU passed if args.gpu == None: raise ValueError('No GPU provided, specific GPU target(s) are needed') @@ -3389,9 +3412,11 @@ class AMDSMICommands(): args.compute_partition, args.memory_partition, args.perf_determinism is not None, - args.power_cap, - args.dpm_policy, - args.xgmi_plpd]): + args.power_cap is not None, + args.dpm_policy is not None, + args.xgmi_plpd is not None, + args.process_isolation is not None, + args.clear_sram_data]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3455,25 +3480,6 @@ class AMDSMICommands(): raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}") - - if args.dpm_policy: - try: - amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") - - if args.xgmi_plpd: - try: - 
amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.dpm_policy}") - if isinstance(args.power_cap, int): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) @@ -3499,6 +3505,48 @@ class AMDSMICommands(): if min_power_cap == 0: min_power_cap = 1 self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap} and {max_power_cap}") + if isinstance(args.dpm_policy, int): + try: + amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") + if isinstance(args.xgmi_plpd, int): + try: + amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.xgmi_plpd}") + if isinstance(args.process_isolation, int): + status_string = "Enabled" if args.process_isolation else "Disabled" + result = f"Requested process isolation to {status_string}" # 
This should not print out + try: + current_status = amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu) + if current_status == args.process_isolation: + result = f"Process isolation is already {status_string}" + else: + amdsmi_interface.amdsmi_set_gpu_process_isolation(args.gpu, args.process_isolation) + result = f"Successfully set process isolation to {status_string}" + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set process isolation to {status_string} on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'process_isolation', result) + if args.clear_sram_data: + try: + # Only 1 can be used for now. + amdsmi_interface.amdsmi_set_gpu_clear_sram_data(args.gpu, 1) + result = 'Successfully clear GPU SRAM data' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to clear SRAM data on GPU {gpu_id}") from e + self.logger.store_output(args.gpu, 'clear_sram_data', result) if multiple_devices: self.logger.store_multiple_device_output() @@ -3513,7 +3561,8 @@ class AMDSMICommands(): cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, - soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None): + soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None, + process_isolation=None, clear_sram_data=None): """Issue reset commands to target gpu(s) Args: @@ -3544,7 +3593,8 @@ class AMDSMICommands(): core_boost_limit (int, optional): Value override for args.core_boost_limit. 
Defaults to None dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. - + process_isolation (int, optional): Value override for args.process_isolation. Defaults to None. + clear_sram_data (int, optional): Value override for args.clear_sram_data. Defaults to None. Raises: ValueError: Value error if no gpu value is provided IndexError: Index error if gpu list is empty @@ -3564,7 +3614,8 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd"] + "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd", "process_isolation", + "clear_sram_data"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3620,7 +3671,8 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy, xgmi_plpd) + memory_partition, power_cap, dpm_policy, xgmi_plpd, + process_isolation, clear_sram_data) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -3639,7 +3691,8 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy, xgmi_plpd) + memory_partition, power_cap, dpm_policy, xgmi_plpd, + process_isolation, clear_sram_data) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, @@ -3660,7 +3713,6 @@ class AMDSMICommands(): compute_partition (bool, optional): Value override for args.compute_partition. 
Defaults to None. memory_partition (bool, optional): Value override for args.memory_partition. Defaults to None. power_cap (int, optional): Value override for args.power_cap. Defaults to None. - Raises: ValueError: Value error if no gpu value is provided IndexError: Index error if gpu list is empty diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index adaa91c34e..f1dae73d29 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -545,6 +545,7 @@ class AMDSMIParser(argparse.ArgumentParser): board_help = "All board information" dpm_policy_help = "The available DPM policy" xgmi_plpd_help = "The available XGMI per-link power down policy" + process_isolation_help = "The process isolation status" # Options arguments help text for Hypervisors and Baremetal ras_help = "Displays RAS features information" @@ -586,6 +587,7 @@ class AMDSMIParser(argparse.ArgumentParser): static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help) static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help) + static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -967,8 +969,9 @@ class AMDSMIParser(argparse.ArgumentParser): set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}" set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" set_power_cap_help = "Set power capacity limit" - set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n" - set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy 
using policy id\n" + set_dpm_policy_help = "Set the GPU DPM policy using policy id\n" + set_xgmi_plpd_help = "Set the GPU XGMI per-link power down policy using policy id\n" + set_process_isolation_help = "Enable or disable the GPU process isolation: 0 for disable and 1 for enable.\n" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." @@ -982,6 +985,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_cpu_enable_apb_help = "Enables the DF p-state performance boost algorithm" set_cpu_disable_apb_help = "Disables the DF p-state performance boost algorithm. Input parameter is DFPstate (0-3)" set_soc_boost_limit_help = "Sets the boost limit for the given socket. Input parameter is socket BOOST_LIMIT value" + run_gpu_clear_sram_data_help = f"Clear the GPU SRAM data\n" # Help text for CPU Core set options set_core_boost_limit_help = "Sets the boost limit for the given core. Input parameter is core BOOST_LIMIT value" @@ -1006,6 +1010,8 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID') set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') + set_value_parser.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=self._not_negative_int, required=False, help=set_process_isolation_help, metavar='STATUS') + set_value_parser.add_argument('-c', '--clear-sram-data', action='store_true', required=False, help=run_gpu_clear_sram_data_help) if self.helpers.is_amd_hsmp_initialized(): # Optional CPU Args diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 
c5adb70252..2840fb5e62 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -3455,6 +3455,68 @@ amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle, uint32_t plpd_id); + +/** + * @brief Get the status of the Process Isolation + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle, this function will write + * the current process isolation status to @p pisolate: 0 means process isolation is + * disabled, and 1 means process isolation is enabled. + * + * @param[in] processor_handle a processor handle + * + * @param[in, out] pisolate the process isolation status. + * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_gpu_process_isolation(amdsmi_processor_handle processor_handle, + uint32_t* pisolate); + +/** + * @brief Enable/disable the system Process Isolation + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle and a process isolation flag + * @p pisolate, this function will set the Process Isolation for this processor: 0 disables + * process isolation, and 1 enables it. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] pisolate the process isolation status to set. 
+ * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle processor_handle, + uint32_t pisolate); + +/** + * @brief Clear the GPU SRAM data + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle, and a sclean flag @p sclean, + * this function will clear the SRAM data of this processor. This can be called between + * user logins to prevent information leak. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] sclean the clean flag. Only 1 will take effect and other number + * are reserved for future usage. + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_gpu_clear_sram_data(amdsmi_processor_handle processor_handle, + uint32_t sclean); + /** @} End PerfCont */ /*****************************************************************************/ diff --git a/py-interface/README.md b/py-interface/README.md index e165eb2860..dae8d0ad1b 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -1963,6 +1963,98 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_gpu_process_isolation + +Description: Get the status of the Process Isolation + +Input parameters: + +* `processor_handle` handle for the given device + +Output: integer corresponding to isolation_status; 0 - disabled, 1 - enabled + +Exceptions that can be thrown by `amdsmi_get_gpu_process_isolation` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + isolate = amdsmi_get_gpu_process_isolation(device) + print("Process Isolation Status: ", isolate) +except AmdSmiException as e: + print(e) 
+``` + +### amdsmi_set_gpu_process_isolation +Description: Enable/disable the system Process Isolation for the given device handle. + +Input parameters: + +* `processor_handle` handle for the given device +* `pisolate` the process isolation status to set. 0 is the process isolation disabled, and 1 is the process isolation enabled. + +Output: None + +Exceptions that can be thrown by `amdsmi_set_gpu_process_isolation` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_set_gpu_process_isolation(device, 1) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_set_gpu_clear_sram_data +Description: Clear the SRAM data of the given device. This can be called between user logins to prevent information leak. + +Input parameters: + +* `processor_handle` handle for the given device +* `sclean` the clean flag. Only 1 will take effect and other number are reserved for future usage. + +Output: None + +Exceptions that can be thrown by `amdsmi_set_gpu_clear_sram_data` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_set_gpu_clear_sram_data(device, 1) +except AmdSmiException as e: + print(e) +``` + + ### amdsmi_get_gpu_overdrive_level Description: Get the overdrive percent associated with the device with provided @@ -2602,6 +2694,75 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_dpm_policy + +Description: Get dpm policy information. + +Input parameters: + +* `processor_handle` handle for the given device +* `policy_id` the policy id to set. 
+ +Output: Dictionary with fields + +Field | Description +---|--- +`num_supported` | total number of supported policies +`current_id` | current policy id +`policies` | list of dictionaries containing possible policies + +Exceptions that can be thrown by `amdsmi_get_dpm_policy` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + dpm_policies = amdsmi_get_dpm_policy(device) + print(dpm_policies) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_set_dpm_policy + +Description: Set the dpm policy to corresponding policy_id. Typically following: 0(default),1,2,3 + +Input parameters: + +* `processor_handle` handle for the given device +* `policy_id` the policy id to set. + +Output: None + +Exceptions that can be thrown by `amdsmi_set_dpm_policy` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_set_dpm_policy(device, 0) +except AmdSmiException as e: + print(e) +``` + ### amdsmi_set_xgmi_plpd Description: Set the xgmi per-link power down policy parameter for the processor diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 61222340bb..696c2be246 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2734,6 +2734,7 @@ def amdsmi_set_clk_freq( ) ) + def amdsmi_set_dpm_policy( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, policy_id: int, @@ -2748,6 +2749,7 @@ def amdsmi_set_dpm_policy( ) ) + def amdsmi_set_xgmi_plpd( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, policy_id: int, @@ -2762,6 +2764,37 @@ def amdsmi_set_xgmi_plpd( ) ) + +def 
amdsmi_set_gpu_process_isolation( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + pisolate: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + amdsmi_wrapper.amdsmi_set_gpu_process_isolation( + processor_handle, pisolate + ) + ) + + +def amdsmi_set_gpu_clear_sram_data( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + sclean: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + amdsmi_wrapper.amdsmi_set_gpu_clear_sram_data( + processor_handle, sclean + ) + ) + + def amdsmi_set_gpu_overdrive_level( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int ): @@ -2793,6 +2826,7 @@ def amdsmi_get_gpu_bdf_id(processor_handle: amdsmi_wrapper.amdsmi_processor_hand return bdfid.value + def amdsmi_set_gpu_pci_bandwidth( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, bitmask: int ) -> None: @@ -3089,7 +3123,6 @@ def amdsmi_set_gpu_od_volt_info( ) - def amdsmi_get_gpu_fan_rpms( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, sensor_idx: int ) -> int: @@ -3320,6 +3353,7 @@ def amdsmi_get_clk_freq( "frequency": list(freq.frequency)[: freq.num_supported - 1], } + def amdsmi_get_dpm_policy( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -3351,6 +3385,7 @@ def amdsmi_get_dpm_policy( "policies": polices, } + def amdsmi_get_xgmi_plpd( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -3382,6 +3417,25 @@ def amdsmi_get_xgmi_plpd( "plpds": polices, } + +def amdsmi_get_gpu_process_isolation( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +) -> int: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise 
AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + pisolate = ctypes.c_uint32() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_process_isolation( + processor_handle, ctypes.byref(pisolate) + ) + ) + + return pisolate.value + + def amdsmi_get_gpu_od_volt_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 06ae08ce18..03f4a952f7 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -2076,6 +2076,15 @@ amdsmi_get_xgmi_plpd.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_ amdsmi_set_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_set_xgmi_plpd amdsmi_set_xgmi_plpd.restype = amdsmi_status_t amdsmi_set_xgmi_plpd.argtypes = [amdsmi_processor_handle, uint32_t] +amdsmi_get_gpu_process_isolation = _libraries['libamd_smi.so'].amdsmi_get_gpu_process_isolation +amdsmi_get_gpu_process_isolation.restype = amdsmi_status_t +amdsmi_get_gpu_process_isolation.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32)] +amdsmi_set_gpu_process_isolation = _libraries['libamd_smi.so'].amdsmi_set_gpu_process_isolation +amdsmi_set_gpu_process_isolation.restype = amdsmi_status_t +amdsmi_set_gpu_process_isolation.argtypes = [amdsmi_processor_handle, uint32_t] +amdsmi_set_gpu_clear_sram_data = _libraries['libamd_smi.so'].amdsmi_set_gpu_clear_sram_data +amdsmi_set_gpu_clear_sram_data.restype = amdsmi_status_t +amdsmi_set_gpu_clear_sram_data.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version amdsmi_get_lib_version.restype = amdsmi_status_t amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)] @@ -2589,7 +2598,7 @@ __all__ = \ 'amdsmi_get_gpu_pci_throughput', 'amdsmi_get_gpu_perf_level', 'amdsmi_get_gpu_pm_metrics_info', 'amdsmi_get_gpu_power_profile_presets', - 'amdsmi_get_gpu_process_list', + 
'amdsmi_get_gpu_process_isolation', 'amdsmi_get_gpu_process_list', 'amdsmi_get_gpu_ras_block_features_enabled', 'amdsmi_get_gpu_ras_feature_info', 'amdsmi_get_gpu_reg_table_info', 'amdsmi_get_gpu_revision', @@ -2646,18 +2655,19 @@ __all__ = \ 'amdsmi_set_cpu_socket_boostlimit', 'amdsmi_set_cpu_socket_lclk_dpm_level', 'amdsmi_set_cpu_socket_power_cap', 'amdsmi_set_cpu_xgmi_width', - 'amdsmi_set_dpm_policy', 'amdsmi_set_gpu_clk_range', - 'amdsmi_set_gpu_compute_partition', + 'amdsmi_set_dpm_policy', 'amdsmi_set_gpu_clear_sram_data', + 'amdsmi_set_gpu_clk_range', 'amdsmi_set_gpu_compute_partition', 'amdsmi_set_gpu_event_notification_mask', 'amdsmi_set_gpu_fan_speed', 'amdsmi_set_gpu_memory_partition', 'amdsmi_set_gpu_od_clk_info', 'amdsmi_set_gpu_od_volt_info', 'amdsmi_set_gpu_overdrive_level', 'amdsmi_set_gpu_pci_bandwidth', 'amdsmi_set_gpu_perf_determinism_mode', 'amdsmi_set_gpu_perf_level', 'amdsmi_set_gpu_power_profile', - 'amdsmi_set_power_cap', 'amdsmi_set_xgmi_plpd', - 'amdsmi_shut_down', 'amdsmi_smu_fw_version_t', - 'amdsmi_socket_handle', 'amdsmi_status_code_to_string', - 'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification', + 'amdsmi_set_gpu_process_isolation', 'amdsmi_set_power_cap', + 'amdsmi_set_xgmi_plpd', 'amdsmi_shut_down', + 'amdsmi_smu_fw_version_t', 'amdsmi_socket_handle', + 'amdsmi_status_code_to_string', 'amdsmi_status_t', + 'amdsmi_stop_gpu_event_notification', 'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t', 'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type', 'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 3749690067..0fafa31c8f 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3362,7 +3362,7 @@ rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, * * @note This function requires root access * - * @param[in] processor_handle a processor handle + * @param[in] 
dv_ind a device index * * @param[in] policy_id the dpm policy will be modified * @@ -3410,6 +3410,61 @@ rsmi_status_t rsmi_dev_xgmi_plpd_get(uint32_t dv_ind, */ rsmi_status_t rsmi_dev_xgmi_plpd_set(uint32_t dv_ind, uint32_t plpd_id); + +/** + * @brief Get the status of the Process Isolation + * + * @details Given a device index @p dv_ind, this function will write + * the current process isolation status to @p pisolate. A value of 0 means process isolation + * is disabled, and 1 means process isolation is enabled. + * + * @param[in] dv_ind a device index + * + * @param[in, out] pisolate the process isolation status. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVAL + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_process_isolation_get(uint32_t dv_ind, + uint32_t* pisolate); + +/** + * @brief Enable/disable the system Process Isolation + * + * @details Given a device index @p dv_ind and a process isolation flag + * @p pisolate, this function will set the Process Isolation for this device. A value of 0 disables + * process isolation, and 1 enables it. + * + * @note This function requires root access + * + * @param[in] dv_ind a device index + * + * @param[in] pisolate the process isolation status to set. + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, + uint32_t pisolate); + +/** + * @brief Clear the GPU SRAM data + * + * + * @details Given a device index @p dv_ind, this function will clear the + * GPU SRAM data of this device. This can be called between user logins to prevent information leaks. + * + * @note This function requires root access + * + * @param[in] dv_ind a device index + * + * @param[in] sclean the clean flag. Only 1 will take effect; other values + * are reserved for future use.
+ * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_gpu_clear_sram_data(uint32_t dv_ind, uint32_t sclean); + /** @} */ // end of PerfCont /*****************************************************************************/ diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index 3df15f2e51..768c736cbc 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -101,6 +101,8 @@ enum DevKFDNodePropTypes { enum DevInfoTypes { kDevPerfLevel, + kDevProcessIsolation, + kDevShaderClean, kDevOverDriveLevel, kDevMemOverDriveLevel, kDevDevID, @@ -222,6 +224,7 @@ class Device { void set_drm_render_minor(uint32_t minor) {drm_render_minor_ = minor;} static rsmi_dev_perf_level perfLvlStrToEnum(std::string s); uint64_t bdfid(void) const {return bdfid_;} + int get_partition_id() const {return (bdfid_ >> 28) & 0xf; } // location_id[31:28] void set_bdfid(uint64_t val) {bdfid_ = val;} pthread_mutex_t *mutex(void) {return mutex_.ptr;} evt::dev_evt_grp_set_t* supported_event_groups(void) { diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 9511a2942f..9318d7359d 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -1974,6 +1974,121 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, } +rsmi_status_t rsmi_dev_process_isolation_get(uint32_t dv_ind, + uint32_t* pisolate) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======= dev_ind:" + << dv_ind; + LOG_TRACE(ss); + CHK_SUPPORT_NAME_ONLY(pisolate) + + // the enforce_isolation sysfs is in this format + // Get the partition_id. For SPX, the partition_id will be 0. 
+ int partition_id = dev->get_partition_id(); + + DEVICE_MUTEX + std::vector val_vec; + rsmi_status_t ret = GetDevValueVec(amd::smi::kDevProcessIsolation, dv_ind, &val_vec); + if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); + return ret; + } + + /* + For TPX system where partition0 is enabled, but partition1 and partition2 are disabled, + it will be in this format: + 0 1 + 1 0 + 2 0 + */ + + for (uint32_t i = 0; i < val_vec.size(); ++i) { + // Get tokens: + auto current_line = amd::smi::trim(val_vec[i]); + std::vector tokens; + std::istringstream f(current_line); + std::string s; + while (getline(f, s, ' ')) { + tokens.push_back(s); + } + int cur_part_id = 0; + if (tokens.size() == 2) { + if (amd::smi::stringToInteger(tokens[0], cur_part_id)) { + if (cur_part_id == partition_id) { + int isolate_status = 0; + if (amd::smi::stringToInteger(tokens[1], isolate_status)) { + *pisolate = isolate_status; + return RSMI_STATUS_SUCCESS; + } else { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", the sysfs line " << current_line + << "should be in format"; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + } + } + } // end tokens.size() + } // end for + + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", cannot find the partition_id " << partition_id + <<" from sysfs"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_FOUND; +} + +rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, + uint32_t pisolate) { + rsmi_status_t ret; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start 
======="; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + // the enforce_isolation sysfs is in this format + // The smi will always pass partition_id. For SPX, the partition_id will be 0. + int partition_id = dev->get_partition_id(); + std::string value = std::to_string(partition_id) + " "+ std::to_string(pisolate); + int ret = dev->writeDevInfo(amd::smi::kDevProcessIsolation , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + +rsmi_status_t rsmi_dev_gpu_clear_sram_data(uint32_t dv_ind, + uint32_t sclean) { + rsmi_status_t ret; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + std::string value = std::to_string(sclean); + int ret = dev->writeDevInfo(amd::smi::kDevShaderClean , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind, uint32_t policy_id) { diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 92de58c6a1..e9ae71b0fc 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -82,6 +82,8 @@ static const char *kDevPCieVendorIDFName = "vendor"; // Device sysfs file names static const char *kDevPerfLevelFName = "power_dpm_force_performance_level"; +static const char *kDevProcessIsolationFName = "enforce_isolation"; +static const char *kDevShaderCleanFName = "run_cleaner_shader"; static const char *kDevDevProdNameFName = "product_name"; static const char *kDevDevProdNumFName = "product_number"; static const char *kDevDevIDFName = "device"; @@ -317,6 +319,8 @@ static const std::map kDevAttribNameMap = { {kDevGpuMetrics, kDevGpuMetricsFName}, {kDevPmMetrics, kDevPmMetricsFName}, {kDevDPMPolicy, kDevDPMPolicyFName}, + {kDevProcessIsolation, kDevProcessIsolationFName}, + {kDevShaderClean, kDevShaderCleanFName}, {kDevRegMetrics, kDevRegMetricsFName}, {kDevGpuReset, 
kDevGpuResetFName}, {kDevAvailableComputePartition, kDevAvailableComputePartitionFName}, @@ -475,6 +479,8 @@ Device::devInfoTypesStrings = { {kDevMemoryPartition, "kDevMemoryPartition"}, {kDevPCieVendorID, "kDevPCieVendorID"}, {kDevDPMPolicy, "kDevDPMPolicy"}, + {kDevProcessIsolation, "kDevProcessIsolation"}, + {kDevShaderClean, "kDevShaderClean"}, }; static const std::map kDevFuncDependsMap = { @@ -516,6 +522,9 @@ static const std::map kDevFuncDependsMap = { {"rsmi_dev_perf_level_set", {{kDevPerfLevelFName}, {}}}, {"rsmi_dev_perf_level_set_v1", {{kDevPerfLevelFName}, {}}}, {"rsmi_dev_perf_level_get", {{kDevPerfLevelFName}, {}}}, + {"rsmi_dev_process_isolation_set", {{kDevProcessIsolationFName}, {}}}, + {"rsmi_dev_process_isolation_get", {{kDevProcessIsolationFName}, {}}}, + {"rsmi_dev_gpu_shader_clean", {{kDevShaderCleanFName}, {}}}, {"rsmi_perf_determinism_mode_set", {{kDevPerfLevelFName, kDevPowerODVoltageFName}, {}}}, {"rsmi_dev_overdrive_level_set", {{kDevOverDriveLevelFName}, {}}}, @@ -939,6 +948,8 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) { sysfs_path += kDevAttribNameMap.at(type); switch (type) { case kDevGPUMClk: + case kDevProcessIsolation: + case kDevShaderClean: case kDevDCEFClk: case kDevFClk: case kDevGPUSClk: @@ -1212,6 +1223,7 @@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { switch (type) { case kDevGPUMClk: + case kDevProcessIsolation: case kDevGPUSClk: case kDevDCEFClk: case kDevFClk: @@ -1279,6 +1291,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevMemoryPartition: case kDevNumaNode: case kDevXGMIPhysicalID: + case kDevProcessIsolation: return readDevInfoStr(type, val); break; diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 7d375fb312..bb88f578cf 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1385,6 +1385,30 @@ amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, reinterpret_cast(policy)); } +amdsmi_status_t 
amdsmi_get_gpu_process_isolation(amdsmi_processor_handle processor_handle, + uint32_t* pisolate) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_process_isolation_get, processor_handle, + pisolate); +} + +amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle processor_handle, + uint32_t pisolate) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_process_isolation_set, processor_handle, + pisolate); +} + +amdsmi_status_t amdsmi_set_gpu_clear_sram_data(amdsmi_processor_handle processor_handle, + uint32_t sclean) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_gpu_clear_sram_data, processor_handle, + sclean); +} + amdsmi_status_t amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages,