From 7d2ab7970d77c3031be042f832ac704ade5cade6 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Mon, 8 Apr 2024 10:35:24 -0500 Subject: [PATCH 01/13] Process isolation and clean shader A few APIs and command line options are added to support process isolation and clean shader. Change-Id: I98ad3fc9fc7429799a21798b7fca1c307de7f403 --- amdsmi_cli/README.md | 13 +- amdsmi_cli/amdsmi_commands.py | 126 ++++++++++----- amdsmi_cli/amdsmi_parser.py | 10 +- include/amd_smi/amdsmi.h | 62 ++++++++ py-interface/README.md | 161 ++++++++++++++++++++ py-interface/amdsmi_interface.py | 56 ++++++- py-interface/amdsmi_wrapper.py | 24 ++- rocm_smi/include/rocm_smi/rocm_smi.h | 57 ++++++- rocm_smi/include/rocm_smi/rocm_smi_device.h | 3 + rocm_smi/src/rocm_smi.cc | 115 ++++++++++++++ rocm_smi/src/rocm_smi_device.cc | 13 ++ src/amd_smi/amd_smi.cc | 24 +++ 12 files changed, 611 insertions(+), 53 deletions(-) diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 06e891475c..27e54e04c2 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -148,9 +148,9 @@ Command Modifiers: ```bash ~$ amd-smi static --help -usage: amd-smi static [-h] [-g GPU [GPU ...] | -U CPU [CPU ...]] [-a] [-b] [-V] [-d] [-v] - [-c] [-B] [-r] [-p] [-l] [-u] [-s] [-i] [--json | --csv] - [--file FILE] [--loglevel LEVEL] +usage: amd-smi static [-h] [-g GPU [GPU ...]] [-a] [-b] [-V] [-d] [-v] [-c] [-B] [-r] [-p] + [-l] [-P] [-x] [-R] [-u] [--json | --csv] [--file FILE] + [--loglevel LEVEL] If no GPU is specified, returns static information for all GPUs on the system. If no static argument is provided, all static information will be displayed. @@ -179,6 +179,7 @@ Static Arguments: -r, --ras Displays RAS features information -p, --partition Partition information -l, --limit All limit metric values (i.e. 
power and thermal limits) + -R, --process-isolation The process isolation status -u, --numa All numa node information CPU Arguments: @@ -474,13 +475,13 @@ Command Modifiers: ```bash usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]) [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] [-M PARTITION] - [-o WATTS] [-p POLICY] [--cpu-pwr-limit PWR_LIMIT] + [-o WATTS] [-p POLICY] [-R STATUS] [--cpu-pwr-limit PWR_LIMIT] [--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH] [--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM] [--cpu-pwr-eff-mode MODE] [--cpu-gmi3-link-width MIN_LW MAX_LW] [--cpu-pcie-link-rate LINK_RATE] [--cpu-df-pstate-range MAX_PSTATE MIN_PSTATE] [--cpu-enable-apb] [--cpu-disable-apb DF_PSTATE] [--soc-boost-limit BOOST_LIMIT] - [--core-boost-limit BOOST_LIMIT] [--json | --csv] [--file FILE] + [--core-boost-limit BOOST_LIMIT] [-c] [--json | --csv] [--file FILE] [--loglevel LEVEL] A GPU must be specified to set a configuration. @@ -514,6 +515,8 @@ Set Arguments: -o, --power-cap WATTS Set power capacity limit -p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id -x, --xgmi-plpd POLICY_ID Set the GPU XGMI per-link power down policy using policy id + -R, --process-isolation STATUS Enable or disable the GPU process isolation: 0 for disable and 1 for enable. + -c, --clear-sram-data Clear the GPU SRAM data CPU Arguments: --cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value. 
diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 17427ff34e..9a4c468686 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -245,7 +245,7 @@ class AMDSMICommands(): def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, - policy=None, xgmi_plpd=None): + policy=None, xgmi_plpd=None, process_isolation=None): """Get Static information for target gpu Args: @@ -270,6 +270,7 @@ class AMDSMICommands(): num_vf (bool, optional): Value override for args.num_vf. Defaults to None. policy (bool, optional): Value override for args.policy. Defaults to None. xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. + process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None. Returns: None: Print output via AMDSMILogger to destination """ @@ -306,8 +307,10 @@ class AMDSMICommands(): args.policy = policy if xgmi_plpd: args.xgmi_plpd = xgmi_plpd - current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd"] - current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd] + if process_isolation: + args.process_isolation = process_isolation + current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd", "process_isolation"] + current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd, args.process_isolation] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -643,6 +646,16 @@ class AMDSMICommands(): logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['xgmi_plpd'] = policy_info + if 'process_isolation' in current_platform_args: + if args.process_isolation: + try: + status = 
amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu) + status = "Enabled" if status else "Disabled" + except amdsmi_exception.AmdSmiLibraryException as e: + status = "N/A" + logging.debug("Failed to process isolation for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['process_isolation'] = status if 'numa' in current_platform_args: if args.numa: try: @@ -779,7 +792,7 @@ class AMDSMICommands(): bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, cpu=None, - interface_ver=None, policy=None, xgmi_plpd = None): + interface_ver=None, policy=None, xgmi_plpd = None, process_isolation=None): """Get Static information for target gpu and cpu Args: @@ -804,6 +817,7 @@ class AMDSMICommands(): interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None policy (bool, optional): Value override for args.policy. Defaults to None. xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. + process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None. 
Raises: IndexError: Index error if gpu list is empty @@ -829,7 +843,8 @@ class AMDSMICommands(): gpu_args_enabled = False gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras", "board", "numa", "vram", "cache", "partition", - "dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd"] + "dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd", + "process_isolation"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr): @@ -859,7 +874,8 @@ class AMDSMICommands(): self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf, policy) + dfc_ucode, fb_info, num_vf, policy, + process_isolation) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None: args.cpu = self.cpu_handles @@ -873,7 +889,8 @@ class AMDSMICommands(): self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf, policy, xgmi_plpd) + dfc_ucode, fb_info, num_vf, policy, xgmi_plpd, + process_isolation) def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -3326,7 +3343,8 @@ class AMDSMICommands(): def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, - memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None): + memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None, + process_isolation=None, clear_sram_data = None): """Issue reset commands to target gpu(s) Args: @@ -3342,7 +3360,8 @@ class AMDSMICommands(): power_cap (int, optional): Value override for args.power_cap. Defaults to None. dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. 
- + process_isolation (int, optional): Value override for args.process_isolation. Defaults to None. + clear_sram_data (int, optional): Value override for args.clear_sram_data. Defaults to None. Raises: ValueError: Value error if no gpu value is provided IndexError: Index error if gpu list is empty @@ -3371,6 +3390,10 @@ class AMDSMICommands(): args.dpm_policy = dpm_policy if xgmi_plpd: args.xgmi_plpd = xgmi_plpd + if process_isolation: + args.process_isolation = process_isolation + if clear_sram_data: + args.clear_sram_data = clear_sram_data # Handle No GPU passed if args.gpu == None: raise ValueError('No GPU provided, specific GPU target(s) are needed') @@ -3389,9 +3412,11 @@ class AMDSMICommands(): args.compute_partition, args.memory_partition, args.perf_determinism is not None, - args.power_cap, - args.dpm_policy, - args.xgmi_plpd]): + args.power_cap is not None, + args.dpm_policy is not None, + args.xgmi_plpd is not None, + args.process_isolation is not None, + args.clear_sram_data]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3455,25 +3480,6 @@ class AMDSMICommands(): raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}") - - if args.dpm_policy: - try: - amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") - - if args.xgmi_plpd: - try: - 
amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.dpm_policy}") - if isinstance(args.power_cap, int): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) @@ -3499,6 +3505,48 @@ class AMDSMICommands(): if min_power_cap == 0: min_power_cap = 1 self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap} and {max_power_cap}") + if isinstance(args.dpm_policy, int): + try: + amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") + if isinstance(args.xgmi_plpd, int): + try: + amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.dpm_policy}") + if isinstance(args.process_isolation, int): + status_string = "Enabled" if args.process_isolation else "Disabled" + result = f"Requested process isolation to {status_string}" # 
This should not print out + try: + current_status = amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu) + if current_status == args.process_isolation: + result = f"Process isolation is already {status_string}" + else: + amdsmi_interface.amdsmi_set_gpu_process_isolation(args.gpu, args.process_isolation) + result = f"Successfully set process isolation to {status_string}" + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set process isolation to {status_string} on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'process_isolation', result) + if args.clear_sram_data: + try: + # Only 1 can be used for now. + amdsmi_interface.amdsmi_set_gpu_clear_sram_data(args.gpu, 1) + result = 'Successfully clear GPU SRAM data' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to clear SRAM data on GPU {gpu_id}") from e + self.logger.store_output(args.gpu, 'clear_sram_data', result) if multiple_devices: self.logger.store_multiple_device_output() @@ -3513,7 +3561,8 @@ class AMDSMICommands(): cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, - soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None): + soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None, + process_isolation=None, clear_sram_data=None): """Issue reset commands to target gpu(s) Args: @@ -3544,7 +3593,8 @@ class AMDSMICommands(): core_boost_limit (int, optional): Value override for args.core_boost_limit. 
Defaults to None dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. - + process_isolation (int, optional): Value override for args.process_isolation. Defaults to None. + clear_sram_data (int, optional): Value override for args.clear_sram_data. Defaults to None. Raises: ValueError: Value error if no gpu value is provided IndexError: Index error if gpu list is empty @@ -3564,7 +3614,8 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd"] + "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd", "process_isolation", + "clear_sram_data"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3620,7 +3671,8 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy, xgmi_plpd) + memory_partition, power_cap, dpm_policy, xgmi_plpd, + process_isolation, clear_sram_data) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -3639,7 +3691,8 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy, xgmi_plpd) + memory_partition, power_cap, dpm_policy, xgmi_plpd, + process_isolation, clear_sram_data) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, @@ -3660,7 +3713,6 @@ class AMDSMICommands(): compute_partition (bool, optional): Value override for args.compute_partition. 
Defaults to None. memory_partition (bool, optional): Value override for args.memory_partition. Defaults to None. power_cap (int, optional): Value override for args.power_cap. Defaults to None. - Raises: ValueError: Value error if no gpu value is provided IndexError: Index error if gpu list is empty diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index adaa91c34e..f1dae73d29 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -545,6 +545,7 @@ class AMDSMIParser(argparse.ArgumentParser): board_help = "All board information" dpm_policy_help = "The available DPM policy" xgmi_plpd_help = "The available XGMI per-link power down policy" + process_isolation_help = "The process isolation status" # Options arguments help text for Hypervisors and Baremetal ras_help = "Displays RAS features information" @@ -586,6 +587,7 @@ class AMDSMIParser(argparse.ArgumentParser): static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help) static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help) + static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -967,8 +969,9 @@ class AMDSMIParser(argparse.ArgumentParser): set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}" set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" set_power_cap_help = "Set power capacity limit" - set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n" - set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy 
using policy id\n" + set_dpm_policy_help = "Set the GPU DPM policy using policy id\n" + set_xgmi_plpd_help = "Set the GPU XGMI per-link power down policy using policy id\n" + set_process_isolation_help = "Enable or disable the GPU process isolation: 0 for disable and 1 for enable.\n" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." @@ -982,6 +985,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_cpu_enable_apb_help = "Enables the DF p-state performance boost algorithm" set_cpu_disable_apb_help = "Disables the DF p-state performance boost algorithm. Input parameter is DFPstate (0-3)" set_soc_boost_limit_help = "Sets the boost limit for the given socket. Input parameter is socket BOOST_LIMIT value" + run_gpu_clear_sram_data_help = f"Clear the GPU SRAM data\n" # Help text for CPU Core set options set_core_boost_limit_help = "Sets the boost limit for the given core. Input parameter is core BOOST_LIMIT value" @@ -1006,6 +1010,8 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID') set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') + set_value_parser.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=self._not_negative_int, required=False, help=set_process_isolation_help, metavar='STATUS') + set_value_parser.add_argument('-c', '--clear-sram-data', action='store_true', required=False, help=run_gpu_clear_sram_data_help) if self.helpers.is_amd_hsmp_initialized(): # Optional CPU Args diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 
c5adb70252..2840fb5e62 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -3455,6 +3455,68 @@ amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle, uint32_t plpd_id); + +/** + * @brief Get the status of the Process Isolation + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle, this function will write + * the current process isolation status to @p pisolate: 0 means process isolation is + * disabled, and 1 means process isolation is enabled. + * + * @param[in] processor_handle a processor handle + * + * @param[in, out] pisolate the process isolation status. + * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_gpu_process_isolation(amdsmi_processor_handle processor_handle, + uint32_t* pisolate); + +/** + * @brief Enable/disable the system Process Isolation + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle and a process isolation flag @p pisolate, + * this function will set the Process Isolation for this processor: 0 disables process + * isolation, and 1 enables process isolation. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] pisolate the process isolation status to set. 
+ * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle processor_handle, + uint32_t pisolate); + +/** + * @brief Clear the GPU SRAM data + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle, and a sclean flag @p sclean, + * this function will clear the SRAM data of this processor. This can be called between + * user logins to prevent information leak. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] sclean the clean flag. Only 1 will take effect and other number + * are reserved for future usage. + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_gpu_clear_sram_data(amdsmi_processor_handle processor_handle, + uint32_t sclean); + /** @} End PerfCont */ /*****************************************************************************/ diff --git a/py-interface/README.md b/py-interface/README.md index e165eb2860..dae8d0ad1b 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -1963,6 +1963,98 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_gpu_process_isolation + +Description: Get the status of the Process Isolation + +Input parameters: + +* `processor_handle` handle for the given device + +Output: integer corresponding to isolation_status; 0 - disabled, 1 - enabled + +Exceptions that can be thrown by `amdsmi_get_gpu_process_isolation` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + isolate = amdsmi_get_gpu_process_isolation(device) + print("Process Isolation Status: ", isolate) +except AmdSmiException as e: + print(e) 
+``` + +### amdsmi_set_gpu_process_isolation +Description: Enable/disable the system Process Isolation for the given device handle. + +Input parameters: + +* `processor_handle` handle for the given device +* `pisolate` the process isolation status to set. 0 is the process isolation disabled, and 1 is the process isolation enabled. + +Output: None + +Exceptions that can be thrown by `amdsmi_set_gpu_process_isolation` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_set_gpu_process_isolation(device, 1) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_set_gpu_clear_sram_data +Description: Clear the SRAM data of the given device. This can be called between user logins to prevent information leak. + +Input parameters: + +* `processor_handle` handle for the given device +* `sclean` the clean flag. Only 1 will take effect and other number are reserved for future usage. + +Output: None + +Exceptions that can be thrown by `amdsmi_set_gpu_clear_sram_data` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_set_gpu_clear_sram_data(device, 1) +except AmdSmiException as e: + print(e) +``` + + ### amdsmi_get_gpu_overdrive_level Description: Get the overdrive percent associated with the device with provided @@ -2602,6 +2694,75 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_dpm_policy + +Description: Get dpm policy information. + +Input parameters: + +* `processor_handle` handle for the given device +* `policy_id` the policy id to set. 
+ +Output: Dictionary with fields + +Field | Description +---|--- +`num_supported` | total number of supported policies +`current_id` | current policy id +`policies` | list of dictionaries containing possible policies + +Exceptions that can be thrown by `amdsmi_get_dpm_policy` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + dpm_policies = amdsmi_get_dpm_policy(device) + print(dpm_policies) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_set_dpm_policy + +Description: Set the dpm policy to corresponding policy_id. Typically following: 0(default),1,2,3 + +Input parameters: + +* `processor_handle` handle for the given device +* `policy_id` the policy id to set. + +Output: None + +Exceptions that can be thrown by `amdsmi_set_dpm_policy` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_set_dpm_policy(device, 0) +except AmdSmiException as e: + print(e) +``` + ### amdsmi_set_xgmi_plpd Description: Set the xgmi per-link power down policy parameter for the processor diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 61222340bb..696c2be246 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2734,6 +2734,7 @@ def amdsmi_set_clk_freq( ) ) + def amdsmi_set_dpm_policy( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, policy_id: int, @@ -2748,6 +2749,7 @@ def amdsmi_set_dpm_policy( ) ) + def amdsmi_set_xgmi_plpd( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, policy_id: int, @@ -2762,6 +2764,37 @@ def amdsmi_set_xgmi_plpd( ) ) + +def 
amdsmi_set_gpu_process_isolation( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + pisolate: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + amdsmi_wrapper.amdsmi_set_gpu_process_isolation( + processor_handle, pisolate + ) + ) + + +def amdsmi_set_gpu_clear_sram_data( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + sclean: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + amdsmi_wrapper.amdsmi_set_gpu_clear_sram_data( + processor_handle, sclean + ) + ) + + def amdsmi_set_gpu_overdrive_level( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int ): @@ -2793,6 +2826,7 @@ def amdsmi_get_gpu_bdf_id(processor_handle: amdsmi_wrapper.amdsmi_processor_hand return bdfid.value + def amdsmi_set_gpu_pci_bandwidth( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, bitmask: int ) -> None: @@ -3089,7 +3123,6 @@ def amdsmi_set_gpu_od_volt_info( ) - def amdsmi_get_gpu_fan_rpms( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, sensor_idx: int ) -> int: @@ -3320,6 +3353,7 @@ def amdsmi_get_clk_freq( "frequency": list(freq.frequency)[: freq.num_supported - 1], } + def amdsmi_get_dpm_policy( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -3351,6 +3385,7 @@ def amdsmi_get_dpm_policy( "policies": polices, } + def amdsmi_get_xgmi_plpd( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -3382,6 +3417,25 @@ def amdsmi_get_xgmi_plpd( "plpds": polices, } + +def amdsmi_get_gpu_process_isolation( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +) -> int: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise 
AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + pisolate = ctypes.c_uint32() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_process_isolation( + processor_handle, ctypes.byref(pisolate) + ) + ) + + return pisolate.value + + def amdsmi_get_gpu_od_volt_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 06ae08ce18..03f4a952f7 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -2076,6 +2076,15 @@ amdsmi_get_xgmi_plpd.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_ amdsmi_set_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_set_xgmi_plpd amdsmi_set_xgmi_plpd.restype = amdsmi_status_t amdsmi_set_xgmi_plpd.argtypes = [amdsmi_processor_handle, uint32_t] +amdsmi_get_gpu_process_isolation = _libraries['libamd_smi.so'].amdsmi_get_gpu_process_isolation +amdsmi_get_gpu_process_isolation.restype = amdsmi_status_t +amdsmi_get_gpu_process_isolation.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32)] +amdsmi_set_gpu_process_isolation = _libraries['libamd_smi.so'].amdsmi_set_gpu_process_isolation +amdsmi_set_gpu_process_isolation.restype = amdsmi_status_t +amdsmi_set_gpu_process_isolation.argtypes = [amdsmi_processor_handle, uint32_t] +amdsmi_set_gpu_clear_sram_data = _libraries['libamd_smi.so'].amdsmi_set_gpu_clear_sram_data +amdsmi_set_gpu_clear_sram_data.restype = amdsmi_status_t +amdsmi_set_gpu_clear_sram_data.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version amdsmi_get_lib_version.restype = amdsmi_status_t amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)] @@ -2589,7 +2598,7 @@ __all__ = \ 'amdsmi_get_gpu_pci_throughput', 'amdsmi_get_gpu_perf_level', 'amdsmi_get_gpu_pm_metrics_info', 'amdsmi_get_gpu_power_profile_presets', - 'amdsmi_get_gpu_process_list', + 
'amdsmi_get_gpu_process_isolation', 'amdsmi_get_gpu_process_list', 'amdsmi_get_gpu_ras_block_features_enabled', 'amdsmi_get_gpu_ras_feature_info', 'amdsmi_get_gpu_reg_table_info', 'amdsmi_get_gpu_revision', @@ -2646,18 +2655,19 @@ __all__ = \ 'amdsmi_set_cpu_socket_boostlimit', 'amdsmi_set_cpu_socket_lclk_dpm_level', 'amdsmi_set_cpu_socket_power_cap', 'amdsmi_set_cpu_xgmi_width', - 'amdsmi_set_dpm_policy', 'amdsmi_set_gpu_clk_range', - 'amdsmi_set_gpu_compute_partition', + 'amdsmi_set_dpm_policy', 'amdsmi_set_gpu_clear_sram_data', + 'amdsmi_set_gpu_clk_range', 'amdsmi_set_gpu_compute_partition', 'amdsmi_set_gpu_event_notification_mask', 'amdsmi_set_gpu_fan_speed', 'amdsmi_set_gpu_memory_partition', 'amdsmi_set_gpu_od_clk_info', 'amdsmi_set_gpu_od_volt_info', 'amdsmi_set_gpu_overdrive_level', 'amdsmi_set_gpu_pci_bandwidth', 'amdsmi_set_gpu_perf_determinism_mode', 'amdsmi_set_gpu_perf_level', 'amdsmi_set_gpu_power_profile', - 'amdsmi_set_power_cap', 'amdsmi_set_xgmi_plpd', - 'amdsmi_shut_down', 'amdsmi_smu_fw_version_t', - 'amdsmi_socket_handle', 'amdsmi_status_code_to_string', - 'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification', + 'amdsmi_set_gpu_process_isolation', 'amdsmi_set_power_cap', + 'amdsmi_set_xgmi_plpd', 'amdsmi_shut_down', + 'amdsmi_smu_fw_version_t', 'amdsmi_socket_handle', + 'amdsmi_status_code_to_string', 'amdsmi_status_t', + 'amdsmi_stop_gpu_event_notification', 'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t', 'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type', 'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 3749690067..0fafa31c8f 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3362,7 +3362,7 @@ rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, * * @note This function requires root access * - * @param[in] processor_handle a processor handle + * @param[in] 
dv_ind a device index * * @param[in] policy_id the dpm policy will be modified * @@ -3410,6 +3410,61 @@ rsmi_status_t rsmi_dev_xgmi_plpd_get(uint32_t dv_ind, */ rsmi_status_t rsmi_dev_xgmi_plpd_set(uint32_t dv_ind, uint32_t plpd_id); + +/** + * @brief Get the status of the Process Isolation + * + * @details Given a device index @p dv_ind, this function will write + * the current process isolation status to @p pisolate. A value of 0 means process isolation + * is disabled, and 1 means process isolation is enabled. + * + * @param[in] dv_ind a device index + * + * @param[in, out] pisolate the process isolation status. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVAL + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_process_isolation_get(uint32_t dv_ind, + uint32_t* pisolate); + +/** + * @brief Enable/disable the system Process Isolation + * + * @details Given a device index @p dv_ind and a process isolation flag @p pisolate, + * this function will set the Process Isolation for this device. A value of 0 disables process + * isolation, and 1 enables it. + * + * @note This function requires root access + * + * @param[in] dv_ind a device index + * + * @param[in] pisolate the process isolation status to set. + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, + uint32_t pisolate); + +/** + * @brief Clear the GPU SRAM data + * + * + * @details Given a device index @p dv_ind, this function will clear the + * GPU SRAM data of this device. This can be called between user logins to prevent information leak. + * + * @note This function requires root access + * + * @param[in] dv_ind a device index + * + * @param[in] sclean the clean flag. Only 1 takes effect; other values + * are reserved for future use. 
+ * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_gpu_clear_sram_data(uint32_t dv_ind, uint32_t sclean); + /** @} */ // end of PerfCont /*****************************************************************************/ diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index 3df15f2e51..768c736cbc 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -101,6 +101,8 @@ enum DevKFDNodePropTypes { enum DevInfoTypes { kDevPerfLevel, + kDevProcessIsolation, + kDevShaderClean, kDevOverDriveLevel, kDevMemOverDriveLevel, kDevDevID, @@ -222,6 +224,7 @@ class Device { void set_drm_render_minor(uint32_t minor) {drm_render_minor_ = minor;} static rsmi_dev_perf_level perfLvlStrToEnum(std::string s); uint64_t bdfid(void) const {return bdfid_;} + int get_partition_id() const {return (bdfid_ >> 28) & 0xf; } // location_id[31:28] void set_bdfid(uint64_t val) {bdfid_ = val;} pthread_mutex_t *mutex(void) {return mutex_.ptr;} evt::dev_evt_grp_set_t* supported_event_groups(void) { diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 9511a2942f..9318d7359d 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -1974,6 +1974,121 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, } +rsmi_status_t rsmi_dev_process_isolation_get(uint32_t dv_ind, + uint32_t* pisolate) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======= dev_ind:" + << dv_ind; + LOG_TRACE(ss); + CHK_SUPPORT_NAME_ONLY(pisolate) + + // the enforce_isolation sysfs is in this format + // Get the partition_id. For SPX, the partition_id will be 0. 
+ int partition_id = dev->get_partition_id(); + + DEVICE_MUTEX + std::vector val_vec; + rsmi_status_t ret = GetDevValueVec(amd::smi::kDevProcessIsolation, dv_ind, &val_vec); + if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); + return ret; + } + + /* + For TPX system where partition0 is enabled, but partition1 and partition2 are disabled, + it will be in this format: + 0 1 + 1 0 + 2 0 + */ + + for (uint32_t i = 0; i < val_vec.size(); ++i) { + // Get tokens: + auto current_line = amd::smi::trim(val_vec[i]); + std::vector tokens; + std::istringstream f(current_line); + std::string s; + while (getline(f, s, ' ')) { + tokens.push_back(s); + } + int cur_part_id = 0; + if (tokens.size() == 2) { + if (amd::smi::stringToInteger(tokens[0], cur_part_id)) { + if (cur_part_id == partition_id) { + int isolate_status = 0; + if (amd::smi::stringToInteger(tokens[1], isolate_status)) { + *pisolate = isolate_status; + return RSMI_STATUS_SUCCESS; + } else { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", the sysfs line " << current_line + << "should be in format"; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + } + } + } // end tokens.size() + } // end for + + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", cannot find the partition_id " << partition_id + <<" from sysfs"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_FOUND; +} + +rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, + uint32_t pisolate) { + rsmi_status_t ret; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start 
======="; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + // the enforce_isolation sysfs is in this format + // The smi will always pass partition_id. For SPX, the partition_id will be 0. + int partition_id = dev->get_partition_id(); + std::string value = std::to_string(partition_id) + " "+ std::to_string(pisolate); + int ret = dev->writeDevInfo(amd::smi::kDevProcessIsolation , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + +rsmi_status_t rsmi_dev_gpu_clear_sram_data(uint32_t dv_ind, + uint32_t sclean) { + rsmi_status_t ret; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + std::string value = std::to_string(sclean); + int ret = dev->writeDevInfo(amd::smi::kDevShaderClean , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind, uint32_t policy_id) { diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 92de58c6a1..e9ae71b0fc 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -82,6 +82,8 @@ static const char *kDevPCieVendorIDFName = "vendor"; // Device sysfs file names static const char *kDevPerfLevelFName = "power_dpm_force_performance_level"; +static const char *kDevProcessIsolationFName = "enforce_isolation"; +static const char *kDevShaderCleanFName = "run_cleaner_shader"; static const char *kDevDevProdNameFName = "product_name"; static const char *kDevDevProdNumFName = "product_number"; static const char *kDevDevIDFName = "device"; @@ -317,6 +319,8 @@ static const std::map kDevAttribNameMap = { {kDevGpuMetrics, kDevGpuMetricsFName}, {kDevPmMetrics, kDevPmMetricsFName}, {kDevDPMPolicy, kDevDPMPolicyFName}, + {kDevProcessIsolation, kDevProcessIsolationFName}, + {kDevShaderClean, kDevShaderCleanFName}, {kDevRegMetrics, kDevRegMetricsFName}, {kDevGpuReset, 
kDevGpuResetFName}, {kDevAvailableComputePartition, kDevAvailableComputePartitionFName}, @@ -475,6 +479,8 @@ Device::devInfoTypesStrings = { {kDevMemoryPartition, "kDevMemoryPartition"}, {kDevPCieVendorID, "kDevPCieVendorID"}, {kDevDPMPolicy, "kDevDPMPolicy"}, + {kDevProcessIsolation, "kDevProcessIsolation"}, + {kDevShaderClean, "kDevShaderClean"}, }; static const std::map kDevFuncDependsMap = { @@ -516,6 +522,9 @@ static const std::map kDevFuncDependsMap = { {"rsmi_dev_perf_level_set", {{kDevPerfLevelFName}, {}}}, {"rsmi_dev_perf_level_set_v1", {{kDevPerfLevelFName}, {}}}, {"rsmi_dev_perf_level_get", {{kDevPerfLevelFName}, {}}}, + {"rsmi_dev_process_isolation_set", {{kDevProcessIsolationFName}, {}}}, + {"rsmi_dev_process_isolation_get", {{kDevProcessIsolationFName}, {}}}, + {"rsmi_dev_gpu_shader_clean", {{kDevShaderCleanFName}, {}}}, {"rsmi_perf_determinism_mode_set", {{kDevPerfLevelFName, kDevPowerODVoltageFName}, {}}}, {"rsmi_dev_overdrive_level_set", {{kDevOverDriveLevelFName}, {}}}, @@ -939,6 +948,8 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) { sysfs_path += kDevAttribNameMap.at(type); switch (type) { case kDevGPUMClk: + case kDevProcessIsolation: + case kDevShaderClean: case kDevDCEFClk: case kDevFClk: case kDevGPUSClk: @@ -1212,6 +1223,7 @@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { switch (type) { case kDevGPUMClk: + case kDevProcessIsolation: case kDevGPUSClk: case kDevDCEFClk: case kDevFClk: @@ -1279,6 +1291,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevMemoryPartition: case kDevNumaNode: case kDevXGMIPhysicalID: + case kDevProcessIsolation: return readDevInfoStr(type, val); break; diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 7d375fb312..bb88f578cf 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1385,6 +1385,30 @@ amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, reinterpret_cast(policy)); } +amdsmi_status_t 
amdsmi_get_gpu_process_isolation(amdsmi_processor_handle processor_handle, + uint32_t* pisolate) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_process_isolation_get, processor_handle, + pisolate); +} + +amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle processor_handle, + uint32_t pisolate) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_process_isolation_set, processor_handle, + pisolate); +} + +amdsmi_status_t amdsmi_set_gpu_clear_sram_data(amdsmi_processor_handle processor_handle, + uint32_t sclean) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_gpu_clear_sram_data, processor_handle, + sclean); +} + amdsmi_status_t amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages, From 614816ab7e4500905b6244903cab414a6d191403 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 24 Apr 2024 05:27:33 -0500 Subject: [PATCH 02/13] Added new functions to py-interface __init__.py Signed-off-by: Maisam Arif Change-Id: I4bd591f834b026793cc9158890e30999cba46e82 --- py-interface/__init__.py | 4 ++++ py-interface/amdsmi_interface.py | 1 + 2 files changed, 5 insertions(+) diff --git a/py-interface/__init__.py b/py-interface/__init__.py index 59d421293a..759c147980 100644 --- a/py-interface/__init__.py +++ b/py-interface/__init__.py @@ -134,6 +134,10 @@ from .amdsmi_interface import amdsmi_set_gpu_fan_speed from .amdsmi_interface import amdsmi_reset_gpu_fan from .amdsmi_interface import amdsmi_set_clk_freq from .amdsmi_interface import amdsmi_set_gpu_overdrive_level +from .amdsmi_interface import amdsmi_set_dpm_policy +from .amdsmi_interface import amdsmi_set_xgmi_plpd +from .amdsmi_interface import amdsmi_set_gpu_clear_sram_data +from .amdsmi_interface import amdsmi_set_gpu_process_isolation # # Physical State Queries from .amdsmi_interface import amdsmi_get_gpu_fan_rpms diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 696c2be246..2cff4bf777 100644 
--- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -365,6 +365,7 @@ class AmdSmiProcessorType(IntEnum): NON_AMD_GPU = amdsmi_wrapper.NON_AMD_GPU NON_AMD_CPU = amdsmi_wrapper.NON_AMD_CPU + class AmdSmiEventReader: def __init__( self, processor_handle: amdsmi_wrapper.amdsmi_processor_handle, From aad42d414a7ec9c81f914e9d8f8a1daec30c16e1 Mon Sep 17 00:00:00 2001 From: khashaik Date: Thu, 18 Apr 2024 07:03:56 +0000 Subject: [PATCH 03/13] amd-smi_cli: Fix issue for set core boost limit in CPU Change-Id: I1af4e9d14b1667c5279fcf02cebb4103a92e162c --- amdsmi_cli/amdsmi_commands.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 9a4c468686..7d157ee969 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -3621,7 +3621,6 @@ class AMDSMICommands(): if getattr(args, attr) is not None: gpu_args_enabled = True break - # Check if a CPU argument has been set cpu_args_enabled = False cpu_attributes = ["cpu_pwr_limit", "cpu_xgmi_link_width", "cpu_lclk_dpm_level", "cpu_pwr_eff_mode", @@ -3629,7 +3628,7 @@ class AMDSMICommands(): "cpu_enable_apb", "cpu_disable_apb", "soc_boost_limit"] for attr in cpu_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr) not in [None, False]: cpu_args_enabled = True break From a0d0210761458a90dd56da934052a75b05c891dd Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 24 Apr 2024 12:32:39 -0500 Subject: [PATCH 04/13] Process isolation sysfs format change The process isolation sysfs format is changed. This fix will adapt to the new sysfs format. 
Change-Id: Id6fd7eeb3e25525047dccab248fd9cfb206cbf62 --- rocm_smi/src/rocm_smi.cc | 118 +++++++++++++++++++++++---------------- 1 file changed, 71 insertions(+), 47 deletions(-) diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 9318d7359d..dd8e903328 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -1987,65 +1988,45 @@ rsmi_status_t rsmi_dev_process_isolation_get(uint32_t dv_ind, int partition_id = dev->get_partition_id(); DEVICE_MUTEX - std::vector val_vec; - rsmi_status_t ret = GetDevValueVec(amd::smi::kDevProcessIsolation, dv_ind, &val_vec); + + std::string str_val; + rsmi_status_t ret = get_dev_value_line(amd::smi::kDevProcessIsolation, dv_ind, &str_val); if (ret == RSMI_STATUS_FILE_ERROR) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << ", get_dev_value_str() ret was RSMI_STATUS_FILE_ERROR " << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } if (ret != RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << ", get_dev_value_str() ret was not RSMI_STATUS_SUCCESS" << " -> reporting " << amd::smi::getRSMIStatusString(ret); LOG_ERROR(ss); return ret; } /* - For TPX system where partition0 is enabled, but partition1 and partition2 are disabled, - it will be in this format: - 0 1 - 1 0 - 2 0 + for 4 partition: enforce isolation is enabled on partition 2 and + disabled on partitions 0, 1, 3. 
+ $ cat /sys/class/drm/cardX/device/enforce_isolation + 0 0 1 0 */ - - for (uint32_t i = 0; i < val_vec.size(); ++i) { - // Get tokens: - auto current_line = amd::smi::trim(val_vec[i]); - std::vector tokens; - std::istringstream f(current_line); - std::string s; - while (getline(f, s, ' ')) { - tokens.push_back(s); - } - int cur_part_id = 0; - if (tokens.size() == 2) { - if (amd::smi::stringToInteger(tokens[0], cur_part_id)) { - if (cur_part_id == partition_id) { - int isolate_status = 0; - if (amd::smi::stringToInteger(tokens[1], isolate_status)) { - *pisolate = isolate_status; - return RSMI_STATUS_SUCCESS; - } else { - ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", the sysfs line " << current_line - << "should be in format"; + std::stringstream iss(str_val); + int number; + std::vector partition_status; + while ( iss >> number ) + partition_status.push_back(number); + if (partition_status.size() <= partition_id) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", the sysfs line " << str_val + << " does not have the partition_id " + << partition_id; LOG_ERROR(ss); return RSMI_STATUS_UNEXPECTED_DATA; - } - } - } - } // end tokens.size() - } // end for - - ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", cannot find the partition_id " << partition_id - <<" from sysfs"; - LOG_ERROR(ss); - return RSMI_STATUS_NOT_FOUND; + } + *pisolate = partition_status[partition_id]; + return RSMI_STATUS_SUCCESS; } rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, @@ -2060,12 +2041,55 @@ rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, DEVICE_MUTEX GET_DEV_FROM_INDX - // the enforce_isolation sysfs is in this format - // The smi will always pass partition_id. For SPX, the partition_id will be 0. 
+ // To set the values, need to specify the setting for all of the partitions + // For two partitions + // echo "1 0" | sudo tee /sys/class/drm/cardX/device/enforce_isolation int partition_id = dev->get_partition_id(); - std::string value = std::to_string(partition_id) + " "+ std::to_string(pisolate); - int ret = dev->writeDevInfo(amd::smi::kDevProcessIsolation , value); - return amd::smi::ErrnoToRsmiStatus(ret); + std::string str_val; + rsmi_status_t ret = get_dev_value_line(amd::smi::kDevProcessIsolation, dv_ind, &str_val); + if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", get_dev_value_str() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", get_dev_value_str() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); + return ret; + } + + // Craft the string that needs to be written. 
+ // (1) parse the read enforce_isolation data into a vector + std::stringstream iss(str_val); + int number; + std::vector partition_status; + while ( iss >> number ) { + partition_status.push_back(number); + } + + // (2) Validate the data + if (partition_status.size() <= partition_id) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", the sysfs line " << str_val + << " does not have the partition_id " + << partition_id; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + + // (3) Create the complete list with the update + partition_status[partition_id] = pisolate; + std::stringstream result; + std::copy(partition_status.begin(), partition_status.end(), + std::ostream_iterator(result, " ")); + + std::string value = result.str().c_str(); + int write_ret = dev->writeDevInfo(amd::smi::kDevProcessIsolation , value); + return amd::smi::ErrnoToRsmiStatus(write_ret); CATCH } From 25ef420407a93ea29a6f8444a5e5d95033b55bc9 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 24 Apr 2024 06:11:32 -0500 Subject: [PATCH 05/13] Updated monitor --pcie to use gpu_metrics pcie bandwidth Signed-off-by: Maisam Arif Change-Id: Id37aebc0297317edcd0f459a4817f56a6030d902 --- CHANGELOG.md | 9 +++++++++ amdsmi_cli/amdsmi_commands.py | 36 ++++++----------------------------- amdsmi_cli/amdsmi_parser.py | 4 ++-- 3 files changed, 17 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d09bb5ea9..c0afea2e29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,15 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Changed +- **Updated `amd-smi monitor --pcie` output** +The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. 
The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output: + +```shell +$ amd-smi monitor --pcie +GPU PCIE_BW + 0 26 Mb/s +``` + - **Updated `amd-smi metric --ecc-blocks` output** The ecc blocks arguement was outputing blocks without counters available, updated the filtering show blocks that counters are available for: diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 7d157ee969..492cba6572 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -4285,38 +4285,14 @@ class AMDSMICommands(): self.logger.table_header += 'VRAM_TOTAL'.rjust(12) if args.pcie: try: - pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) - sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz'] - received = pcie_bw['received'] * pcie_bw['max_pkt_sz'] - - bw_unit = "Mb/s" - packet_size_unit = "B" - if sent > 0: - sent = sent // 1024 // 1024 - if received > 0: - received = received // 1024 // 1024 - - if self.logger.is_human_readable_format(): - sent = f"{sent} {bw_unit}" - received = f"{received} {bw_unit}" - pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}" - if self.logger.is_json_format(): - sent = {"value" : sent, - "unit" : bw_unit} - received = {"value" : received, - "unit" : bw_unit} - pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'], - "unit" : packet_size_unit} - - monitor_values['pcie_tx'] = sent - monitor_values['pcie_rx'] = received + pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + pcie_bw_unit = 'Mb/s' + monitor_values['pcie_bw'] = self.helpers.unit_format(self.logger, pcie_info['pcie_bandwidth'], pcie_bw_unit) except amdsmi_exception.AmdSmiLibraryException as e: - monitor_values['pcie_tx'] = "N/A" - monitor_values['pcie_rx'] = "N/A" - logging.debug("Failed to get pci throughput on gpu %s | %s", gpu_id, e.get_error_info()) + monitor_values['pcie_bw'] = "N/A" + logging.debug("Failed to get pci bandwidth on gpu %s | 
%s", gpu_id, e.get_error_info()) - self.logger.table_header += 'PCIE_TX'.rjust(10) - self.logger.table_header += 'PCIE_RX'.rjust(10) + self.logger.table_header += 'PCIE_BW'.rjust(10) self.logger.store_output(args.gpu, 'values', monitor_values) diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index f1dae73d29..4b11188b03 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -1110,7 +1110,7 @@ class AMDSMIParser(argparse.ArgumentParser): throttle_help = "Monitor thermal throttle status" ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts" mem_usage_help = "Monitor memory usage in MB" - pcie_throughput_help = "Monitor PCIe Tx/Rx in MB/s" + pcie_bandwidth_help = "Monitor PCIe bandwidth in Mb/s" # Create monitor subparser monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help) @@ -1133,7 +1133,7 @@ class AMDSMIParser(argparse.ArgumentParser): monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help) monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help) - monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_throughput_help) + monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help) def _add_rocm_smi_parser(self, subparsers, func): From e6054be6e7547207df68f4fcfa09176783453655 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Fri, 26 Apr 2024 02:54:25 -0500 Subject: [PATCH 06/13] SWDEV-453493 - Fix Null pointer reference in amd-smi bad-pages Signed-off-by: Maisam Arif Change-Id: I10a1278b68cbb464dd0fb38a2de50413f6f43959 --- CHANGELOG.md | 14 +++++-- amdsmi_cli/amdsmi_commands.py | 35 +++++++++++++----- py-interface/README.md | 51 +++++++++++++++++++++++++- 
py-interface/amdsmi_interface.py | 63 +++++++++++++++++--------------- 4 files changed, 119 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0afea2e29..970604dd4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Changed +- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** +Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. + - **Updated `amd-smi monitor --pcie` output** The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output: @@ -332,18 +335,21 @@ $ /opt/rocm/bin/amd-smi topology -a -t --json ### Fixed -- **Fix for GPU reset error on non-amdgpu cards** +- **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** +Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well. + +- **Fix for GPU reset error on non-amdgpu cards** Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix updates CLI to target only AMD ASICs. -- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** +- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards). 
-- **Fix for `amd-smi process`** +- **Fix for `amd-smi process`** Fixed output results when getting processes running on a device. -- **Improved Error handling for `amd-smi process`** +- **Improved Error handling for `amd-smi process`** Fixed Attribute Error when getting process in csv format ### Known issues diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 492cba6572..72fe9f96f1 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1015,14 +1015,19 @@ class AMDSMICommands(): # Get gpu_id for logging gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + bad_pages_not_found = "No bad pages found." try: bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu) + # If bad_page_info is an empty list overwrite with not found error statement + if bad_page_info == []: + bad_page_info = bad_pages_not_found + bad_page_error = True + else: + bad_page_error = False except amdsmi_exception.AmdSmiLibraryException as e: bad_page_info = "N/A" - logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info()) - - if bad_page_info == "N/A" or bad_page_info == "No bad pages found.": bad_page_error = True + logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info()) if args.retired: if bad_page_error: @@ -1034,13 +1039,17 @@ class AMDSMICommands(): bad_page_info_entry = {} bad_page_info_entry["page_address"] = bad_page["page_address"] bad_page_info_entry["page_size"] = bad_page["page_size"] - bad_page_info_entry["status"] = bad_page["status"].name + status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]] + bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "") bad_page_info_output.append(bad_page_info_entry) # Remove brackets if there is only one value if len(bad_page_info_output) == 1: bad_page_info_output = bad_page_info_output[0] - values_dict['retired'] = 
bad_page_info_output + if bad_page_info_output == []: + values_dict['retired'] = bad_pages_not_found + else: + values_dict['retired'] = bad_page_info_output if args.pending: if bad_page_error: @@ -1052,13 +1061,17 @@ class AMDSMICommands(): bad_page_info_entry = {} bad_page_info_entry["page_address"] = bad_page["page_address"] bad_page_info_entry["page_size"] = bad_page["page_size"] - bad_page_info_entry["status"] = bad_page["status"].name + status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]] + bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "") bad_page_info_output.append(bad_page_info_entry) # Remove brackets if there is only one value if len(bad_page_info_output) == 1: bad_page_info_output = bad_page_info_output[0] - values_dict['pending'] = bad_page_info_output + if bad_page_info_output == []: + values_dict['pending'] = bad_pages_not_found + else: + values_dict['pending'] = bad_page_info_output if args.un_res: if bad_page_error: @@ -1070,13 +1083,17 @@ class AMDSMICommands(): bad_page_info_entry = {} bad_page_info_entry["page_address"] = bad_page["page_address"] bad_page_info_entry["page_size"] = bad_page["page_size"] - bad_page_info_entry["status"] = bad_page["status"].name + status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]] + bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "") bad_page_info_output.append(bad_page_info_entry) # Remove brackets if there is only one value if len(bad_page_info_output) == 1: bad_page_info_output = bad_page_info_output[0] - values_dict['un_res'] = bad_page_info_output + if bad_page_info_output == []: + values_dict['un_res'] = bad_pages_not_found + else: + values_dict['un_res'] = bad_page_info_output # Store values in logger.output self.logger.store_output(args.gpu, 'values', values_dict) diff --git a/py-interface/README.md b/py-interface/README.md index 
dae8d0ad1b..4cc6ecaf0d 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -843,7 +843,7 @@ Input parameters: * `processor_handle` device which to query -Output: List consisting of dictionaries with fields for each bad page found +Output: List consisting of dictionaries with fields for each bad page found; can be an empty list Field | Description ---|--- @@ -868,7 +868,7 @@ try: else: for device in devices: bad_page_info = amdsmi_get_gpu_bad_page_info(device) - if not len(bad_page_info): + if not bad_page_info: # Can be empty list print("No bad pages found") continue for bad_page in bad_page_info: @@ -880,6 +880,53 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_gpu_memory_reserved_pages + +Description: Returns reserved memory page info for the given GPU. +It is not supported on virtual machine guest + +Input parameters: + +* `processor_handle` device which to query + +Output: List consisting of dictionaries with fields for each reserved memory page found; can be an empty list + +Field | Description +---|--- +`value` | Value of memory reserved page +`page_address` | Address of memory reserved page +`page_size` | Size of memory reserved page +`status` | Status of memory reserved page + +Exceptions that can be thrown by `amdsmi_get_gpu_memory_reserved_pages` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + reserved_memory_page_info = amdsmi_get_gpu_memory_reserved_pages(device) + if not reserved_memory_page_info: # Can be empty list + print("No memory reserved pages found") + continue + for reserved_memory_page in reserved_memory_page_info: + print(reserved_memory_page["value"]) + print(reserved_memory_page["page_address"]) + print(reserved_memory_page["page_size"]) + print(reserved_memory_page["status"]) +except AmdSmiException as 
e: + print(e) +``` + + ### amdsmi_get_gpu_process_list Description: Returns the list of processes running on the target GPU; May require root level access diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 2cff4bf777..793ffdec61 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -440,21 +440,22 @@ def _format_bad_page_info(bad_page_info, bad_page_count: ctypes.c_uint32) -> Lis Format bad page info data retrieved. Parameters: - bad_page_info(`POINTER(amdsmi_retired_page_record_t)`): Pointer to bad page info - retrieved. + bad_page_info(`amdsmi_retired_page_record_t`): A populated list of amdsmi_retired_page_record_t(s) + retrieved. Ex: (amdsmi_wrapper.amdsmi_retired_page_record_t * #)() bad_page_count(`c_uint32`): Bad page count. Returns: - `list`: List containing formatted bad pages. + `list`: List containing formatted bad pages. Can be empty """ - if not isinstance( - bad_page_info, ctypes.POINTER( - amdsmi_wrapper.amdsmi_retired_page_record_t) - ): - raise AmdSmiParameterException( - bad_page_info, ctypes.POINTER( - amdsmi_wrapper.amdsmi_retired_page_record_t) - ) + if bad_page_count == 0: + return [] + + # Check if each struct within bad_page_info is valid + for bad_page in bad_page_info: + if not isinstance(bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t): + raise AmdSmiParameterException( + bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t + ) table_records = [] for i in range(bad_page_count.value): @@ -1803,23 +1804,24 @@ def amdsmi_get_gpu_bad_page_info( ) num_pages = ctypes.c_uint32() - retired_page_record = ctypes.POINTER( - amdsmi_wrapper.amdsmi_retired_page_record_t)() - + nullptr = ctypes.POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_bad_page_info( - processor_handle, ctypes.byref(num_pages), retired_page_record + processor_handle, ctypes.byref(num_pages), nullptr ) ) - table_records = 
_format_bad_page_info(retired_page_record, num_pages) - if num_pages.value == 0: - return "No bad pages found." - else: - table_records = _format_bad_page_info(retired_page_record, num_pages) + return [] - return table_records + bad_pages = (amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages.value)() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_bad_page_info( + processor_handle, ctypes.byref(num_pages), bad_pages + ) + ) + + return _format_bad_page_info(bad_pages, num_pages) def amdsmi_get_gpu_total_ecc_count( @@ -3859,21 +3861,24 @@ def amdsmi_get_gpu_memory_reserved_pages( ) num_pages = ctypes.c_uint32() - retired_page_record = ctypes.POINTER( - amdsmi_wrapper.amdsmi_retired_page_record_t)() + nullptr = ctypes.POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages( - processor_handle, ctypes.byref(num_pages), retired_page_record + processor_handle, ctypes.byref(num_pages), nullptr ) ) - table_records = _format_bad_page_info(retired_page_record, num_pages) if num_pages.value == 0: - return "No bad pages found." 
- else: - table_records = _format_bad_page_info(retired_page_record, num_pages) + return [] - return table_records + mem_reserved_pages = (amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages)() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages( + processor_handle, ctypes.byref(num_pages), mem_reserved_pages + ) + ) + + return _format_bad_page_info(mem_reserved_pages, num_pages) def amdsmi_get_gpu_metrics_header_info( From 962e217d08f2d9825423e1e5356ddf330f8118dc Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 30 Apr 2024 19:13:46 -0500 Subject: [PATCH 07/13] Updated README example output Signed-off-by: Maisam Arif Change-Id: I45e7ecea022a028501f381fea4291bf78cc4494b --- py-interface/README.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/py-interface/README.md b/py-interface/README.md index 4cc6ecaf0d..6061152f8b 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -3367,13 +3367,8 @@ Example: ```python try: - devices = amdsmi_get_processor_handles() - if len(devices) == 0: - print("No GPUs on machine") - else: - for device in devices: - version = amdsmi_get_lib_version() - print(version) + version = amdsmi_get_lib_version() + print(version) except AmdSmiException as e: print(e) ``` From 1423fb632e340359bd3e4d7e71035112199eb08e Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Thu, 2 May 2024 01:23:18 -0500 Subject: [PATCH 08/13] SWDEV-458102 - Deprecated Voltage Curve API Signed-off-by: Maisam Arif Change-Id: I111c3ce26d2ab66d5e755432f4b8a9bfa631f805 --- CHANGELOG.md | 31 +- amdsmi_cli/README.md | 3 +- amdsmi_cli/amdsmi_commands.py | 38 +- amdsmi_cli/amdsmi_parser.py | 2 - include/amd_smi/amdsmi.h | 13 +- py-interface/README.md | 4 +- rocm_smi/include/rocm_smi/rocm_smi.h | 6 +- rocm_smi/include/rocm_smi/rocm_smi_utils.h | 328 +++++++++++++++++- rocm_smi/src/rocm_smi.cc | 161 ++++----- rocm_smi/src/rocm_smi_utils.cc | 9 +- .../functional/mutual_exclusion.cc | 8 +- 
.../functional/volt_freq_curv_read.cc | 22 +- 12 files changed, 443 insertions(+), 182 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 970604dd4c..eae998bfdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,16 +4,22 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ***All information listed below is for reference and subject to change.*** -## amd_smi_lib for ROCm 6.1.1 +## amd_smi_lib for ROCm 6.1.2 ### Added -- N/A +- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** +Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. ### Changed -- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** -Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. +- **Deprecated Volt Curve APIs** +The latest amdgpu driver has dropped support for getting and setting volt curve information. amdsmi_set_gpu_od_volt_info() & amdsmi_get_gpu_od_volt_curve_regions() have been deprecated with amdsmi_get_gpu_od_volt_info() now no longer populating voltage curve frequencies. + +- **Removed `amd-smi metric --voltage-curve` from CLI Tool** +Due to amdgpu driver dropping support for voltage curve, the CLI option has been removed as well. + +### Optimizations - **Updated `amd-smi monitor --pcie` output** The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. 
The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output: @@ -24,6 +30,23 @@ GPU PCIE_BW 0 26 Mb/s ``` +### Fixed + +- **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** +Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well. + +### Known issues + +- None + +## amd_smi_lib for ROCm 6.1.1 + +### Added + +- N/A + +### Changed + - **Updated `amd-smi metric --ecc-blocks` output** The ecc blocks arguement was outputing blocks without counters available, updated the filtering show blocks that counters are available for: diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 27e54e04c2..a152894415 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -269,7 +269,7 @@ Command Modifiers: ~$ amd-smi metric --help usage: amd-smi metric [-h] [-g GPU [GPU ...] | -U CPU [CPU ...] 
| -O CORE [CORE ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-m] [-u] [-p] [-c] [-t] - [-P] [-e] [-k] [-f] [-C] [-o] [-l] [-x] [-E] [--cpu-power-metrics] + [-P] [-e] [-k] [-f] [-o] [-l] [-x] [-E] [--cpu-power-metrics] [--cpu-prochot] [--cpu-freq-metrics] [--cpu-c0-res] [--cpu-lclk-dpm-level NBIOID] [--cpu-pwr-svi-telemtry-rails] [--cpu-io-bandwidth IO_BW LINKID_NAME] @@ -313,7 +313,6 @@ Metric arguments: -e, --ecc Total number of ECC errors -k, --ecc-blocks Number of ECC errors per block -f, --fan Current fan speed - -C, --voltage-curve Display voltage curve -o, --overdrive Current GPU clock overdrive level -l, --perf-level Current DPM performance level -x, --xgmi-err XGMI error information since last read diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 72fe9f96f1..54539ddd66 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1108,7 +1108,7 @@ class AMDSMICommands(): def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=None, usage=None, watch=None, watch_time=None, iterations=None, power=None, clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, - fan=None, voltage_curve=None, overdrive=None, perf_level=None, + fan=None, overdrive=None, perf_level=None, xgmi_err=None, energy=None, mem_usage=None, schedule=None, guard=None, guest_data=None, fb_usage=None, xgmi=None,): """Get Metric information for target gpu @@ -1129,7 +1129,6 @@ class AMDSMICommands(): ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None. pcie (bool, optional): Value override for args.pcie. Defaults to None. fan (bool, optional): Value override for args.fan. Defaults to None. - voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None. overdrive (bool, optional): Value override for args.overdrive. Defaults to None. perf_level (bool, optional): Value override for args.perf_level. Defaults to None. 
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. @@ -1188,8 +1187,6 @@ class AMDSMICommands(): if self.helpers.is_baremetal() and self.helpers.is_linux(): if fan: args.fan = fan - if voltage_curve: - args.voltage_curve = voltage_curve if overdrive: args.overdrive = overdrive if perf_level: @@ -1198,8 +1195,8 @@ class AMDSMICommands(): args.xgmi_err = xgmi_err if energy: args.energy = energy - current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy"] - current_platform_values += [args.fan, args.voltage_curve, args.overdrive, args.perf_level, args.xgmi_err, args.energy] + current_platform_args += ["fan", "overdrive", "perf_level", "xgmi_err", "energy"] + current_platform_values += [args.fan, args.overdrive, args.perf_level, args.xgmi_err, args.energy] if self.helpers.is_hypervisor(): if schedule: @@ -1786,26 +1783,6 @@ class AMDSMICommands(): logging.debug("Failed to get fan rpms for gpu %s | %s", args.gpu, e.get_error_info()) values_dict["fan"] = fan_dict - if "voltage_curve" in current_platform_args: - if args.voltage_curve: - try: - od_volt = amdsmi_interface.amdsmi_get_gpu_od_volt_info(args.gpu) - - voltage_point_dict = {} - - for point in range(3): - if isinstance(od_volt, dict): - frequency = int(od_volt["curve.vc_points"][point].frequency / 1000000) - voltage = int(od_volt["curve.vc_points"][point].voltage) - else: - frequency = 0 - voltage = 0 - voltage_point_dict[f'voltage_point_{point}'] = f"{frequency} Mhz {voltage} mV" - - values_dict['voltage_curve'] = voltage_point_dict - except amdsmi_exception.AmdSmiLibraryException as e: - values_dict['voltage_curve'] = "N/A" - logging.debug("Failed to get voltage curve for gpu %s | %s", gpu_id, e.get_error_info()) if "overdrive" in current_platform_args: if args.overdrive: try: @@ -2321,7 +2298,7 @@ class AMDSMICommands(): def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, usage=None, watch=None, watch_time=None, 
iterations=None, power=None, clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, - fan=None, voltage_curve=None, overdrive=None, perf_level=None, + fan=None, overdrive=None, perf_level=None, xgmi_err=None, energy=None, mem_usage=None, schedule=None, guard=None, guest_data=None, fb_usage=None, xgmi=None, cpu=None, cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None, @@ -2350,7 +2327,6 @@ class AMDSMICommands(): ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None. pcie (bool, optional): Value override for args.pcie. Defaults to None. fan (bool, optional): Value override for args.fan. Defaults to None. - voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None. overdrive (bool, optional): Value override for args.overdrive. Defaults to None. perf_level (bool, optional): Value override for args.perf_level. Defaults to None. xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. @@ -2404,7 +2380,7 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock", - "temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve", + "temperature", "ecc", "ecc_blocks", "pcie", "fan", "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule", "guard", "guest_data", "fb_usage", "xgmi"] for attr in gpu_attributes: @@ -2477,7 +2453,7 @@ class AMDSMICommands(): self.metric_gpu(args, multiple_devices, watching_output, gpu, usage, watch, watch_time, iterations, power, clock, temperature, ecc, ecc_blocks, pcie, - fan, voltage_curve, overdrive, perf_level, + fan, overdrive, perf_level, xgmi_err, energy, mem_usage, schedule, guard, guest_data, fb_usage, xgmi) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized @@ -2512,7 +2488,7 @@ class AMDSMICommands(): self.metric_gpu(args, multiple_devices, watching_output, gpu, usage, watch, 
watch_time, iterations, power, clock, temperature, ecc, ecc_blocks, pcie, - fan, voltage_curve, overdrive, perf_level, + fan, overdrive, perf_level, xgmi_err, energy, mem_usage, schedule) diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 4b11188b03..af22db7137 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -699,7 +699,6 @@ class AMDSMIParser(argparse.ArgumentParser): # Help text for Arguments only on Linux Baremetal platforms fan_help = "Current fan speed" - vc_help = "Display voltage curve" overdrive_help = "Current GPU clock overdrive level" perf_level_help = "Current DPM performance level" xgmi_err_help = "XGMI error information since last read" @@ -770,7 +769,6 @@ class AMDSMIParser(argparse.ArgumentParser): # Optional Args for Linux Baremetal Systems if self.helpers.is_baremetal() and self.helpers.is_linux(): metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) - metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help) metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help) diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 2840fb5e62..6502cabc11 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -1253,12 +1253,13 @@ typedef struct { typedef struct { amdsmi_range_t curr_sclk_range; //!< The current SCLK frequency range amdsmi_range_t curr_mclk_range; //!< The current MCLK frequency range; - //!< (upper bound only) + //!< (upper bound only) amdsmi_range_t sclk_freq_limits; //!< The range possible of SCLK values amdsmi_range_t mclk_freq_limits; //!< The range possible of MCLK values /** * @brief The current voltage curve + 
* @deprecated ::Voltage curve support has been deprecated by the driver */ amdsmi_od_volt_curve_t curve; uint32_t num_regions; //!< The number of voltage curve regions @@ -2965,7 +2966,7 @@ amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_status_t amdsmi_reset_gpu(amdsmi_processor_handle processor_handle); /** - * @brief This function retrieves the voltage/frequency curve information. It is + * @brief This function retrieves the overdrive GFX & MCLK information. It is * not supported on virtual machine guest * * @platform{gpu_bm_linux} @@ -3166,6 +3167,9 @@ amdsmi_status_t amdsmi_set_gpu_od_clk_info(amdsmi_processor_handle processor_han * * @platform{gpu_bm_linux} * + * @deprecated ::Voltage curve information is no longer supported by the + * amdgpu driver; this includes the ability to set voltage curve regions + * * @details Given a processor handle @p processor_handle, a voltage point @p vpoint * and a voltage value @p voltvalue this function will set voltage curve point * @@ -3192,6 +3196,9 @@ amdsmi_status_t amdsmi_set_gpu_od_volt_info(amdsmi_processor_handle processor_ha * * @platform{gpu_bm_linux} * + * @deprecated ::Voltage curve information is no longer supported by the + * amdgpu driver; this includes the number of valid voltage regions + * * @details Given a processor handle @p processor_handle, a pointer to an unsigned integer * @p num_regions and a buffer of ::amdsmi_freq_volt_region_t structures, @p * buffer, this function will populate @p buffer with the current @@ -3502,7 +3509,7 @@ amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle process * @platform{gpu_bm_linux} @platform{guest_1vf} * * @details Given a processor handle @p processor_handle, and a sclean flag @p sclean, - * this function will clear the SRAM data of this processor. This can be called between + * this function will clear the SRAM data of this processor. This can be called between * user logins to prevent information leak. 
* * @note This function requires root access diff --git a/py-interface/README.md b/py-interface/README.md index 6061152f8b..7d1f414565 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -1591,7 +1591,7 @@ except AmdSmiException as e: ### amdsmi_set_gpu_od_clk_info -Description: This function sets the clock frequency information +Description: **deprecated** This function sets the clock frequency information It is not supported on virtual machine guest Input parameters: @@ -2306,7 +2306,7 @@ except AmdSmiException as e: ### amdsmi_get_gpu_od_volt_curve_regions -Description: This function will retrieve the current valid regions in the +Description: **deprecated** This function will retrieve the current valid regions in the frequency/voltage space. It is not supported on virtual machine guest Input parameters: diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 0fafa31c8f..8797cf1b5f 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3058,6 +3058,8 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, /** * @brief This function sets 1 of the 3 voltage curve points. * + * @deprecated This function is deprecated due to driver changes. + * * @details Given a device index @p dv_ind, a voltage point @p vpoint * and a voltage value @p voltvalue this function will set voltage curve point * @@ -3083,6 +3085,8 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, * @brief This function will retrieve the current valid regions in the * frequency/voltage space. * + * @deprecated This function is deprecated due to driver changes. 
+ * * @details Given a device index @p dv_ind, a pointer to an unsigned integer * @p num_regions and a buffer of ::rsmi_freq_volt_region_t structures, @p * buffer, this function will populate @p buffer with the current @@ -3448,7 +3452,7 @@ rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, uint32_t pisolate); /** - * @brief Clear the GPU SRAM data + * @brief Clear the GPU SRAM data * * * @details Given a device index @p dv_ind, this function will clear the diff --git a/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/rocm_smi/include/rocm_smi/rocm_smi_utils.h index 67d9d8b8d8..32e6bdeefc 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -45,14 +45,17 @@ #include -#include +#include #include -#include -#include #include -#include -#include +#include +#include #include +#include +#include +#include +#include +#include #include "rocm_smi/rocm_smi_device.h" @@ -125,13 +128,33 @@ std::string print_rsmi_od_volt_freq_regions(uint32_t num_regions, bool is_sudo_user(); rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, std::string *gfx_version); + +std::string leftTrim(const std::string &s); +std::string rightTrim(const std::string &s); +std::string trim(const std::string &s); +std::string removeNewLines(const std::string &s); + +std::string removeString(const std::string origStr, + const std::string &removeMe); template - std::string print_int_as_hex(T i, bool showHexNotation = true) { + std::string print_int_as_hex(T i, bool showHexNotation = true, + int overloadBitSize = 0) { std::stringstream ss; if (showHexNotation) { - ss << "0x" << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; + if (overloadBitSize == 0) { + ss << "0x" << std::hex << std::setw(sizeof(T) * 2) << std::setfill('0'); + } else { + // 8 bits per 1 byte + int byteSize = (overloadBitSize / 8) * 2; + ss << "0x" << std::hex << std::setw(byteSize) << std::setfill('0'); + } } else { - ss << std::setfill('0') << 
std::setw(sizeof(T) * 2) << std::hex; + if (overloadBitSize == 0) { + ss << std::hex << std::setw(sizeof(T) * 2) << std::setfill('0'); + } else { + int byteSize = (overloadBitSize / 8) * 2; + ss << std::hex << std::setw(byteSize) << std::setfill('0'); + } } if (std::is_same::value) { @@ -162,7 +185,8 @@ std::string print_unsigned_hex_and_int(T i, std::string heading="") { } ss << "Hex (MSB): " << print_int_as_hex(i) << ", " << "Unsigned int: " << print_unsigned_int(i) << ", " - << "Byte Size: " << sizeof(T); + << "Byte Size: " << sizeof(T) << ", " + << "Bits: " << sizeof(T) * 8; // 8 bits per 1 byte return ss.str(); } @@ -283,8 +307,290 @@ class ScopedAcquire { // In VM environment, the /proc/cpuinfo set hypervisor flag by default bool is_vm_guest(); -// trim a string -std::string trim(const std::string &s); + +// +enum class TagSplitterPositional_t +{ + kFIRST, + kBETWEEN, + kLAST, + kNONE, +}; + +template +class TagTextContents_t +{ + public: + using TextLines_t = std::vector; + using PrimaryList_t = std::vector; + using SecondaryList_t = std::vector; + using PrimaryKeyTbl_t = std::map; + using SecondaryKeyTbl_t = std::map; + using StructuredKeysTbl_t = std::map>; + + // + TagTextContents_t() = default; + TagTextContents_t(const TagTextContents_t&) = delete; + TagTextContents_t(TagTextContents_t&&) = delete; + TagTextContents_t& operator=(const TagTextContents_t&) = delete; + TagTextContents_t& operator=(TagTextContents_t&&) = delete; + + explicit TagTextContents_t(const TextLines_t& text_content) + : m_text_content(text_content) {} + + TagTextContents_t& set_text_content(const TextLines_t& text_content) + { + m_text_content = text_content; + } + + TagTextContents_t& set_title_terminator(const std::string& title_mark, + TagSplitterPositional_t title_mark_position) { + m_title_mark = title_mark; + m_title_mark_position = title_mark_position; + + return *this; + } + + TagTextContents_t& set_key_data_splitter(const std::string& line_splitter_mark, + 
TagSplitterPositional_t line_mark_position) { + m_line_splitter_mark = line_splitter_mark; + m_line_mark_position = line_mark_position; + + return *this; + } + + TagTextContents_t& structure_content() { + // Sanitizes the content. + if (!m_text_content.empty()) { + std::for_each(m_text_content.begin(), m_text_content.end(), trim); + section_title_lookup(); + section_data_lookup(); + } + + return *this; + } + + decltype(auto) get_title_size() { + return m_primary.size(); + } + + decltype(auto) get_structured_subkeys_size(const PrimaryKeyType& prim_key) { + return m_structured[prim_key].size(); + } + + decltype(auto) contains_title_key(const PrimaryKeyType& key) { + return (m_primary.find(key) != m_primary.end()); + } + + decltype(auto) contains_structured_key(const PrimaryKeyType& prim_key, + const SecondaryKeyType& sec_key) { + if (auto first_key_itr = m_structured.find(prim_key); + first_key_itr != m_structured.end()) { + if (auto sec_key_itr = first_key_itr->second.find(sec_key); + sec_key_itr != first_key_itr->second.end()) { + return true; + } + } + + return false; + } + + decltype(auto) get_structured_value_by_keys(const PrimaryKeyType& prim_key, + const SecondaryKeyType& sec_key, + bool is_value_id = true) { + if (auto first_key_itr = m_structured.find(prim_key); + first_key_itr != m_structured.end()) { + if (auto sec_key_itr = first_key_itr->second.find(sec_key); + sec_key_itr != first_key_itr->second.end()) { + SecondaryDataType key_value{}; + if (is_value_id) { + key_value = SecondaryDataType(sec_key_itr->first) + " "; + } + key_value += sec_key_itr->second; + return key_value; + } + } + + return SecondaryDataType{}; + } + + decltype(auto) get_structured_data_subkey_by_position(const PrimaryKeyType& prim_key, + uint32_t key_position) { + auto key_counter = uint32_t(0); + SecondaryKeyType data_key{}; + if (key_position < (get_structured_subkeys_size(prim_key))) { + for (const auto& [sec_key, sec_value] : m_structured[prim_key]) { + if (key_counter == 
key_position) { + data_key = static_cast(sec_key); + return data_key; + } + ++key_counter; + } + } + + return data_key; + } + + decltype(auto) get_structured_data_subkey_first(const PrimaryKeyType& prim_key) { + return (get_structured_value_by_keys(prim_key, + get_structured_data_subkey_by_position(prim_key, 0))); + } + + decltype(auto) get_structured_data_subkey_last(const PrimaryKeyType& prim_key) { + return (get_structured_value_by_keys(prim_key, get_structured_data_subkey_by_position(prim_key, + (get_structured_subkeys_size(prim_key) - 1)))); + } + + void reset() { + m_text_content.clear(); + m_primary.clear(); + m_structured.clear(); + m_title_mark.clear(); + m_line_splitter_mark.clear(); + m_title_mark_position = TagSplitterPositional_t::kNONE; + m_line_mark_position = TagSplitterPositional_t::kNONE; + } + + decltype(auto) dump_structured_content() { + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; + ostrstream << "** Primary Table **" << "\n"; + for (const auto& [key, values] : m_primary) { + ostrstream << "key: " << key << " values: " << values.size() << "\n"; + for (const auto& value : values) { + ostrstream << "\t value: " << value << "\n"; + } + } + + ostrstream << "\n ** Structured Table **" << "\n"; + for (const auto& [prim_key, prim_values] : m_structured) { + ostrstream << "key: " << prim_key << "\n"; + for (const auto& [sec_key, sec_value] : prim_values) { + ostrstream << "\t key: " << sec_key << " -> " << sec_value << "\n"; + } + } + ostrstream << "\n\n"; + + return ostrstream.str(); + } + + + private: + TextLines_t m_text_content; + PrimaryKeyTbl_t m_primary; + StructuredKeysTbl_t m_structured; + std::string m_title_mark; + std::string m_line_splitter_mark; + TagSplitterPositional_t m_title_mark_position; + TagSplitterPositional_t m_line_mark_position; + + // + // Note: Organizes table with Title as a Key, and a list of values. 
+ // + decltype(auto) section_title_lookup() { + if (m_title_mark.empty() || + m_title_mark_position == TagSplitterPositional_t::kNONE) { + return; + } + + // + // Note: + // - top_title_line: Left pointer for the sliding window + // - bottom_title_line: Right pointer for the sliding window + // + auto top_title_line = uint32_t(std::numeric_limits::max()); + auto bottom_title_line = uint32_t(std::numeric_limits::max()); + auto line_counter = uint32_t(0); + + // + // Note: This whole interval/window where the section/title starts, and where it ends. + // + auto update_primary_tbl = [&](const uint32_t& from_line, const uint32_t& to_line) { + auto key = static_cast(m_text_content[from_line]); + for (auto line_num(from_line + 1); line_num < to_line; ++line_num) { + if ((line_num < m_text_content.size()) && !m_text_content[line_num].empty()) { + m_primary[key].push_back(m_text_content[line_num]); + } + } + }; + + auto adjust_sliding_window = [&](const uint32_t& title_line) { + // First time top_title_line gets adjusted. 
+ if (top_title_line == uint32_t(std::numeric_limits::max())) { + top_title_line = title_line; + bottom_title_line = top_title_line; + return; + } + if (title_line > bottom_title_line) { + bottom_title_line = title_line; + update_primary_tbl(top_title_line, bottom_title_line); + top_title_line = bottom_title_line; + } + }; + + for (const auto& line : m_text_content) { + auto was_title_found{false}; + switch (m_title_mark_position) { + case TagSplitterPositional_t::kFIRST: + // Section/Title Mark was found at the first position + if (line.find_first_of(m_title_mark.c_str()) == 0) { + was_title_found = true; + } + break; + + case TagSplitterPositional_t::kLAST: + // Section/Title Mark was found at the last position + if ((line.find_last_of(m_title_mark.c_str()) + 1) == line.size()) { + was_title_found = true; + } + break; + + default: + break; + } + + if (was_title_found) { + adjust_sliding_window(line_counter); + } + ++line_counter; + } + + // Any remaining elements? + if (line_counter > bottom_title_line) { + update_primary_tbl(bottom_title_line, (line_counter - 1)); + } + } + + decltype(auto) section_data_lookup() { + if (m_line_splitter_mark.empty() || + m_line_mark_position == TagSplitterPositional_t::kNONE) { + return; + } + + // + // Note: Organizes table with Title as a Key, a Key/ID for values and values. + // It takes into consideration the initial constraints were all good and + // that the primary table has been populated. 
+ for (const auto& [prim_key, prim_values] : m_primary) { + for (const auto& value : prim_values) { + if (auto mark_pos = value.find_first_of(m_line_splitter_mark.c_str()); + mark_pos != std::string::npos) { + auto sec_key = trim(value.substr(0, mark_pos + 1)); + auto sec_data = trim(value.substr((mark_pos + 1), value.size())); + if (!sec_key.empty()) { + m_structured[prim_key].insert(std::make_pair(sec_key, sec_data)); + } + } + } + } + } + +}; + +using TextFileTagContents_t = TagTextContents_t; + } // namespace smi } // namespace amd diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index dd8e903328..aa5f30d9d1 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -1415,17 +1415,6 @@ For the new format, GFXCLK field will show min and max values(0/1). If the curre frequency in neither min/max but lies within the range, this is indicated by an additional value followed by * at index 1 and max value at index 2. */ -constexpr uint32_t kOD_SCLK_label_array_index = 0; -constexpr uint32_t kOD_MCLK_label_array_index = - kOD_SCLK_label_array_index + 3; -constexpr uint32_t kOD_VDDC_CURVE_label_array_index = - kOD_MCLK_label_array_index + 2; -constexpr uint32_t kOD_OD_RANGE_label_array_index = - kOD_VDDC_CURVE_label_array_index + 4; -constexpr uint32_t kOD_VDDC_CURVE_start_index = - kOD_OD_RANGE_label_array_index + 3; -// constexpr uint32_t kOD_VDDC_CURVE_num_lines = -// kOD_VDDC_CURVE_start_index + 4; constexpr uint32_t kMIN_VALID_LINES = 2; static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, @@ -1450,62 +1439,75 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, return RSMI_STATUS_NOT_YET_IMPLEMENTED; } - assert(val_vec[kOD_SCLK_label_array_index] == "OD_SCLK:" || - val_vec[kOD_SCLK_label_array_index] == "GFXCLK:"); - if ((val_vec[kOD_SCLK_label_array_index] != "OD_SCLK:") && - (val_vec[kOD_SCLK_label_array_index] != "GFXCLK:")) { - return RSMI_STATUS_UNEXPECTED_DATA; + // + const std::string kTAG_OD_SCLK{"OD_SCLK:"}; + 
const std::string kTAG_GFXCLK{"GFXCLK:"}; + const std::string KTAG_OD_MCLK{"OD_MCLK:"}; + const std::string KTAG_MCLK{"MCLK:"}; + const std::string KTAG_FIRST_FREQ_IDX{"0:"}; + amd::smi::TextFileTagContents_t txt_power_dev_od_voltage(val_vec); + txt_power_dev_od_voltage + .set_title_terminator(":", amd::smi::TagSplitterPositional_t::kLAST) + .set_key_data_splitter(":", amd::smi::TagSplitterPositional_t::kBETWEEN) + .structure_content(); + + // + // Note: We must have minimum of 'GFXCLK:' && 'MCLK:' OR: + // 'OD_SCLK:' && 'OD_MCLK:' tags. + if (txt_power_dev_od_voltage.get_title_size() < kMIN_VALID_LINES) { + return rsmi_status_t::RSMI_STATUS_NO_DATA; } - - // find last_item but skip empty lines - int last_item = val_vec.size()-1; - while (val_vec[last_item].empty() || val_vec[last_item][0] == 0) - last_item--; - - p->curr_sclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_SCLK_label_array_index + 1); - p->curr_sclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_SCLK_label_array_index + 2); - - if (val_vec.size() < (kOD_MCLK_label_array_index + 1)) { - return RSMI_STATUS_UNEXPECTED_SIZE; - } - // The condition below checks if it is the old style or new style format. 
- if (val_vec[kOD_MCLK_label_array_index] == "OD_MCLK:") { - p->curr_mclk_range.lower_bound = 0; - p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_MCLK_label_array_index + 1); - } else if (val_vec[kOD_MCLK_label_array_index] == "MCLK:") { - p->curr_mclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_MCLK_label_array_index + 1); - // the upper memory frequency is the last - p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, last_item); - return RSMI_STATUS_SUCCESS; - } else { - if (val_vec.size() < (kOD_MCLK_label_array_index + 3)) { - return RSMI_STATUS_UNEXPECTED_SIZE; - } - if (val_vec[kOD_MCLK_label_array_index + 1] == "MCLK:") { - p->curr_sclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_SCLK_label_array_index + 3); - p->curr_mclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_MCLK_label_array_index + 2); - // the upper memory frequency is the last - p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, last_item); - return RSMI_STATUS_SUCCESS; - } - return RSMI_STATUS_NOT_YET_IMPLEMENTED; + // Note: For debug builds/purposes only. + assert(txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) || + txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK)); + // Note: For release builds/purposes. 
+ if (!txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) && + !txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK)) { + return rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; } - if (val_vec.size() < kOD_VDDC_CURVE_label_array_index) { - return RSMI_STATUS_UNEXPECTED_SIZE; - } + // Note: Quick helpers for getting 1st and last elements found + auto build_lower_bound = [&](const std::string& prim_key) { + auto lower_bound_data = txt_power_dev_od_voltage.get_structured_data_subkey_first(prim_key); + return std::vector{lower_bound_data}; + }; - p->num_regions = - static_cast((val_vec.size()) / 2); + auto build_upper_bound = [&](const std::string& prim_key) { + auto upper_bound_data = txt_power_dev_od_voltage.get_structured_data_subkey_last(prim_key); + return std::vector{upper_bound_data}; + }; + + // Validates 'OD_SCLK' is in the structure + if (txt_power_dev_od_voltage.contains_structured_key(kTAG_OD_SCLK, + KTAG_FIRST_FREQ_IDX)) { + p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); + p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); + + // Validates 'OD_MCLK' is in the structure + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_MCLK, + KTAG_FIRST_FREQ_IDX)) { + p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); + p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); + } + } + // Validates 'GFXCLK' is in the structure + else if (txt_power_dev_od_voltage.contains_structured_key(kTAG_GFXCLK, + KTAG_FIRST_FREQ_IDX)) { + p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_GFXCLK), nullptr, nullptr, 0); + p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_GFXCLK), nullptr, nullptr, 0); + + // Validates 'MCLK' is in the structure + if 
(txt_power_dev_od_voltage.contains_structured_key(KTAG_MCLK, + KTAG_FIRST_FREQ_IDX)) { + p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_MCLK), nullptr, nullptr, 0); + p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_MCLK), nullptr, nullptr, 0); + } + } + else { + return RSMI_STATUS_NOT_YET_IMPLEMENTED; + } + p->num_regions = 0; return RSMI_STATUS_SUCCESS; CATCH @@ -1674,30 +1676,6 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, } -static void get_vc_region(uint32_t start_ind, - std::vector *val_vec, rsmi_freq_volt_region_t *p) { - std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << " | ======= start ======="; - LOG_TRACE(ss); - assert(p != nullptr); - assert(val_vec != nullptr); - THROW_IF_NULLPTR_DEREF(p) - THROW_IF_NULLPTR_DEREF(val_vec) - - // There must be at least 1 region to read in - assert(val_vec->size() >= kOD_OD_RANGE_label_array_index + 2); - assert((*val_vec)[kOD_OD_RANGE_label_array_index] == "OD_RANGE:"); - if ((val_vec->size() < kOD_OD_RANGE_label_array_index + 2) || - ((*val_vec)[kOD_OD_RANGE_label_array_index] != "OD_RANGE:") ) { - ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning " - << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA); - LOG_TRACE(ss); - throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA, __FUNCTION__); - } - od_value_pair_str_to_range((*val_vec)[start_ind], &p->freq_range); - od_value_pair_str_to_range((*val_vec)[start_ind + 1], &p->volt_range); -} - /* * num_regions [inout] on calling, the number of regions requested to be read * in. At completion, the number of regions actually read in @@ -1729,23 +1707,20 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, // This is a work-around to handle systems where kDevPowerODVoltage is not // fully supported yet. 
- if (val_vec.size() < 2) { + if (val_vec.size() < kMIN_VALID_LINES) { ss << __PRETTY_FUNCTION__ - << " | Issue: val_vec.size() < 2" << "; returning " + << " | Issue: val_vec.size() < " << kMIN_VALID_LINES << "; returning " << getRSMIStatusString(RSMI_STATUS_NOT_YET_IMPLEMENTED); LOG_ERROR(ss); return RSMI_STATUS_NOT_YET_IMPLEMENTED; } uint32_t val_vec_size = static_cast(val_vec.size()); - assert((val_vec_size - kOD_VDDC_CURVE_start_index) > 0); - ss << __PRETTY_FUNCTION__ << " | val_vec_size = " << std::dec - << val_vec_size - << " | kOD_VDDC_CURVE_start_index = " << kOD_VDDC_CURVE_start_index; + << val_vec_size; LOG_DEBUG(ss); - *num_regions = std::min((val_vec_size) / 2, *num_regions); + *num_regions = 0; return RSMI_STATUS_SUCCESS; CATCH diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc index 61ec4243dc..45dd3fe40f 100755 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -1134,14 +1134,6 @@ std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv) { ss << pt_rng_Mhz("\t**Current SCLK frequency range: ", &odv->curr_sclk_range); ss << pt_rng_Mhz("\t**Current MCLK frequency range: ", &odv->curr_mclk_range); - ss << pt_rng_Mhz("\t**Min/Max Possible SCLK frequency range: ", - &odv->sclk_freq_limits); - ss << pt_rng_Mhz("\t**Min/Max Possible MCLK frequency range: ", - &odv->mclk_freq_limits); - - ss << "\t**Current Freq/Volt. curve: " << "\n"; - ss << pt_vddc_curve(&odv->curve); - ss << "\t**Number of Freq./Volt. 
regions: " << odv->num_regions << "\n\n"; return ss.str(); } @@ -1224,5 +1216,6 @@ std::queue getAllDeviceGfxVers() { return deviceGfxVersions; } + } // namespace smi } // namespace amd diff --git a/tests/amd_smi_test/functional/mutual_exclusion.cc b/tests/amd_smi_test/functional/mutual_exclusion.cc index 48bbe82934..e5578619f1 100755 --- a/tests/amd_smi_test/functional/mutual_exclusion.cc +++ b/tests/amd_smi_test/functional/mutual_exclusion.cc @@ -183,10 +183,10 @@ void TestMutualExclusion::Run(void) { int64_t dmy_i64 = 0; char dmy_str[10]; amdsmi_dev_perf_level_t dmy_perf_lvl; - amdsmi_frequencies_t dmy_freqs; - amdsmi_od_volt_freq_data_t dmy_od_volt; - amdsmi_freq_volt_region_t dmy_vlt_reg; - amdsmi_error_count_t dmy_err_cnt; + amdsmi_frequencies_t dmy_freqs{}; + amdsmi_od_volt_freq_data_t dmy_od_volt{}; + amdsmi_freq_volt_region_t dmy_vlt_reg{}; + amdsmi_error_count_t dmy_err_cnt{}; amdsmi_ras_err_state_t dmy_ras_err_st; // This can be replaced with ASSERT_EQ() once env. stabilizes diff --git a/tests/amd_smi_test/functional/volt_freq_curv_read.cc b/tests/amd_smi_test/functional/volt_freq_curv_read.cc index 4c1a758fc9..080d8e9a1a 100755 --- a/tests/amd_smi_test/functional/volt_freq_curv_read.cc +++ b/tests/amd_smi_test/functional/volt_freq_curv_read.cc @@ -146,7 +146,7 @@ static void print_amdsmi_od_volt_freq_regions(uint32_t num_regions, void TestVoltCurvRead::Run(void) { amdsmi_status_t err; - amdsmi_od_volt_freq_data_t odv; + amdsmi_od_volt_freq_data_t odv{}; TestBase::Run(); if (setup_failed_) { @@ -177,25 +177,5 @@ void TestVoltCurvRead::Run(void) { err = amdsmi_get_gpu_od_volt_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_INVAL); } - - if (err == AMDSMI_STATUS_SUCCESS) { - std::cout << "\t**Frequency-voltage curve data:" << std::endl; - print_amdsmi_od_volt_freq_data_t(&odv); - - amdsmi_freq_volt_region_t *regions; - uint32_t num_regions; - regions = new amdsmi_freq_volt_region_t[odv.num_regions]; - ASSERT_TRUE(regions != nullptr); - - 
num_regions = odv.num_regions; - err = amdsmi_get_gpu_od_volt_curve_regions(processor_handles_[i], &num_regions, regions); - CHK_ERR_ASRT(err) - ASSERT_TRUE(num_regions == odv.num_regions); - - std::cout << "\t**Frequency-voltage curve regions:" << std::endl; - print_amdsmi_od_volt_freq_regions(num_regions, regions); - - delete []regions; - } } } From 051d5a4d427b946e18c23d13919bfb2e8136fe4f Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Thu, 2 May 2024 01:40:48 -0500 Subject: [PATCH 09/13] Bump Version to 24.5.2.0 Signed-off-by: Maisam Arif Change-Id: I2f51ed93a356e55156983c56bac293a5d7d3b5c1 --- CMakeLists.txt | 2 +- amdsmi_cli/README.md | 2 +- docs/doxygen/Doxyfile | 2 +- include/amd_smi/amdsmi.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6cf0d289ef..8c49ad3261 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ find_program(GIT NAMES git) ## Setup the package version based on git tags. set(PKG_VERSION_GIT_TAG_PREFIX "amdsmi_pkg_ver") -get_package_version_number("24.5.1" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +get_package_version_number("24.5.2" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) message("Package version: ${PKG_VERSION_STR}") set(${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") set(${AMD_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index a152894415..7b2252911e 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -79,7 +79,7 @@ amd-smi will report the version and current platform detected when running the c ~$ amd-smi usage: amd-smi [-h] ... 
-AMD System Management Interface | Version: 24.5.1.0 | ROCm version: 6.1.1 | Platform: Linux Baremetal +AMD System Management Interface | Version: 24.5.2.0 | ROCm version: 6.1.2 | Platform: Linux Baremetal options: -h, --help show this help message and exit diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index ef62d4d1c2..144fc42d49 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -48,7 +48,7 @@ PROJECT_NAME = AMD SMI # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "24.5.1.0" +PROJECT_NUMBER = "24.5.2.0" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 6502cabc11..f354515ecb 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -154,7 +154,7 @@ typedef enum { #define AMDSMI_LIB_VERSION_MAJOR 5 //! Minor version should be updated for each API change, but without changing headers -#define AMDSMI_LIB_VERSION_MINOR 1 +#define AMDSMI_LIB_VERSION_MINOR 2 //! 
Release version should be set to 0 as default and can be updated by the PMs for each CSP point release #define AMDSMI_LIB_VERSION_RELEASE 0 From c24d66740e5922605425b4b445846a63bb4d5e87 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Thu, 4 Apr 2024 10:06:47 -0500 Subject: [PATCH 10/13] SWDEV-450580 - Fix powercap set Updates: * CLI - Added AMDSMIHelpers.convert_SI_unit() to help conversion of units * API - Reverted to uW for power cap limits * CLI - amd-smi static --limit now includes MIN_POWER * Tests now are all using uW units to keep W conversion to only happen in CLI * Python API now reflects same units as uW (what is seen in amdgpu driver) * CLI - amd-smi metric --power: Fixed power seen on gpu_metrics v1.3 Change-Id: I32d9ba78d0d8806772f0860f9a803a885b3f316a Signed-off-by: Charis Poag --- CHANGELOG.md | 70 ++++++++++++++++ amdsmi_cli/amdsmi_commands.py | 83 +++++++++++-------- amdsmi_cli/amdsmi_helpers.py | 44 ++++++++++ example/amd_smi_drm_example.cc | 10 +-- include/amd_smi/amdsmi.h | 13 +-- py-interface/README.md | 14 ++-- src/amd_smi/amd_smi.cc | 5 -- src/amd_smi/amd_smi_utils.cc | 2 - .../functional/power_cap_read_write.cc | 49 +++++++---- 9 files changed, 212 insertions(+), 78 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eae998bfdb..ad3eba258e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,76 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ***All information listed below is for reference and subject to change.*** +## amd_smi_lib for ROCm 6.2 (Unreleased) + +### Added + +- **Added `MIN_POWER` to output of `amd-smi static --limit`** +This change helps users identify the range within which the GPU's power cap can be set. It also makes it easier to see why a device supports (or does not support) power capping (also known as overdrive). See `amd-smi set -g all --power-cap WATTS` or `amd-smi reset -g all --power-cap`. 
+```shell +$ amd-smi static --limit +GPU: 0 + LIMIT: + MAX_POWER: 203 W + MIN_POWER: 0 W + SOCKET_POWER: 203 W + SLOWDOWN_EDGE_TEMPERATURE: 100 °C + SLOWDOWN_HOTSPOT_TEMPERATURE: 110 °C + SLOWDOWN_VRAM_TEMPERATURE: 100 °C + SHUTDOWN_EDGE_TEMPERATURE: 105 °C + SHUTDOWN_HOTSPOT_TEMPERATURE: 115 °C + SHUTDOWN_VRAM_TEMPERATURE: 105 °C + +GPU: 1 + LIMIT: + MAX_POWER: 213 W + MIN_POWER: 213 W + SOCKET_POWER: 213 W + SLOWDOWN_EDGE_TEMPERATURE: 109 °C + SLOWDOWN_HOTSPOT_TEMPERATURE: 110 °C + SLOWDOWN_VRAM_TEMPERATURE: 100 °C + SHUTDOWN_EDGE_TEMPERATURE: 114 °C + SHUTDOWN_HOTSPOT_TEMPERATURE: 115 °C + SHUTDOWN_VRAM_TEMPERATURE: 105 °C +``` + +### Changed + +- **`amdsmi_get_power_cap_info` now returns values in uW instead of W** +`amdsmi_get_power_cap_info` now returns values in uW, as originally reported by the driver. Previously `amdsmi_get_power_cap_info` returned values in W, which conflicted with our set APIs and modified the values retrieved from the driver. We decided to keep the values returned by the driver untouched (in their original units, uW); the CLI then converts them to watts (as previously done - no changes here). Additionally, the driver changed the min power cap reported for devices when overdrive is disabled, which prompted this change (in this case min_power_cap and max_power_cap are the same). + +### Optimizations + +- N/A + +### Fixed +- **Fixed `amd-smi metric --power` to provide power output for Navi2x/Navi3x/MI1x** +These systems use an older version of gpu_metrics in amdgpu. This fix only updates what the CLI outputs. +No change in any of our APIs. 
+```shell +$ amd-smi metric --power +GPU: 0 + POWER: + SOCKET_POWER: 11 W + GFX_VOLTAGE: 768 mV + SOC_VOLTAGE: 925 mV + MEM_VOLTAGE: 1250 mV + POWER_MANAGEMENT: ENABLED + THROTTLE_STATUS: UNTHROTTLED + +GPU: 1 + POWER: + SOCKET_POWER: 17 W + GFX_VOLTAGE: 781 mV + SOC_VOLTAGE: 806 mV + MEM_VOLTAGE: 1250 mV + POWER_MANAGEMENT: ENABLED + THROTTLE_STATUS: UNTHROTTLED +``` +- **Fixed `amdsmitstReadWrite.TestPowerCapReadWrite` test for Navi3X, Navi2X, MI100** +The fix required `amdsmi_get_power_cap_info` to return values in uW, as originally reported by the driver. Previously `amdsmi_get_power_cap_info` returned values in W, which conflicted with our set APIs and modified the values retrieved from the driver. We decided to keep the values returned by the driver untouched (in their original units, uW); the CLI then converts them to watts (as previously done - no changes here). Additionally, the driver changed the min power cap reported for devices when overdrive is disabled, which prompted this change (in this case min_power_cap and max_power_cap are the same). 
+ + ## amd_smi_lib for ROCm 6.1.2 ### Added diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 54539ddd66..6cf63f9b92 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -414,7 +414,11 @@ class AMDSMICommands(): power_limit_error = False power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) max_power_limit = power_cap_info['max_power_cap'] + max_power_limit = AMDSMIHelpers.convert_SI_unit(max_power_limit, AMDSMIHelpers.SI_Unit.MICRO) + min_power_limit = power_cap_info['min_power_cap'] + min_power_limit = AMDSMIHelpers.convert_SI_unit(min_power_limit, AMDSMIHelpers.SI_Unit.MICRO) socket_power_limit = power_cap_info['power_cap'] + socket_power_limit = AMDSMIHelpers.convert_SI_unit(socket_power_limit, AMDSMIHelpers.SI_Unit.MICRO) except amdsmi_exception.AmdSmiLibraryException as e: power_limit_error = True max_power_limit = "N/A" @@ -492,11 +496,18 @@ class AMDSMICommands(): power_unit = 'W' temp_unit_human_readable = '\N{DEGREE SIGN}C' temp_unit_json = 'C' - if self.logger.is_human_readable_format(): - if not power_limit_error: - max_power_limit = f"{max_power_limit} {power_unit}" - socket_power_limit = f"{socket_power_limit} {power_unit}" + if not power_limit_error: + max_power_limit = self.helpers.unit_format(self.logger, + max_power_limit, + power_unit) + min_power_limit = self.helpers.unit_format(self.logger, + min_power_limit, + power_unit) + socket_power_limit = self.helpers.unit_format(self.logger, + socket_power_limit, + power_unit) + if self.logger.is_human_readable_format(): if not slowdown_temp_edge_limit_error: slowdown_temp_edge_limit = f"{slowdown_temp_edge_limit} {temp_unit_human_readable}" if not slowdown_temp_hotspot_limit_error: @@ -509,13 +520,8 @@ class AMDSMICommands(): shutdown_temp_hotspot_limit = f"{shutdown_temp_hotspot_limit} {temp_unit_human_readable}" if not shutdown_temp_vram_limit_error: shutdown_temp_vram_limit = f"{shutdown_temp_vram_limit} 
{temp_unit_human_readable}" - if self.logger.is_json_format(): - if not power_limit_error: - max_power_limit = {"value" : max_power_limit, - "unit" : power_unit} - socket_power_limit = {"value" : socket_power_limit, - "unit" : power_unit} + if self.logger.is_json_format(): if not slowdown_temp_edge_limit_error: slowdown_temp_edge_limit = {"value" : slowdown_temp_edge_limit, "unit" : temp_unit_json} @@ -538,6 +544,7 @@ class AMDSMICommands(): limit_info = {} # Power limits limit_info['max_power'] = max_power_limit + limit_info['min_power'] = min_power_limit limit_info['socket_power'] = socket_power_limit # Shutdown limits @@ -1326,24 +1333,19 @@ class AMDSMICommands(): for key, value in power_info.items(): if value == 0xFFFF: power_info[key] = "N/A" - elif self.logger.is_human_readable_format(): - if "voltage" in key: - power_info[key] = f"{value} {voltage_unit}" - elif "power" in key: - power_info[key] = f"{value} {power_unit}" - elif self.logger.is_json_format(): - if "voltage" in key: - power_info[key] = {"value" : value, - "unit" : voltage_unit} - elif "power" in key: - power_info[key] = {"value" : value, - "unit" : power_unit} - - power_dict['socket_power'] = power_info['current_socket_power'] - - if power_dict['socket_power'] == "N/A": - # For older gpu's when current power doesn't populate we use the average socket power instead - power_dict['socket_power'] = power_info['average_socket_power'] + elif "voltage" in key: + power_info[key] = self.helpers.unit_format(self.logger, + value, + voltage_unit) + elif "power" in key: + if ((key == "current_socket_power" or key == "average_socket_power") + and value != "N/A"): + power_dict['socket_power'] = self.helpers.unit_format(self.logger, + value, + power_unit) + power_info[key] = self.helpers.unit_format(self.logger, + value, + power_unit) power_dict['gfx_voltage'] = power_info['gfx_voltage'] power_dict['soc_voltage'] = power_info['soc_voltage'] @@ -3478,8 +3480,11 @@ class AMDSMICommands(): power_cap_info = 
amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") min_power_cap = power_cap_info["min_power_cap"] + min_power_cap = AMDSMIHelpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO) max_power_cap = power_cap_info["max_power_cap"] + max_power_cap = AMDSMIHelpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO) current_power_cap = power_cap_info["power_cap"] + current_power_cap = AMDSMIHelpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO) except amdsmi_exception.AmdSmiLibraryException as e: raise ValueError(f"Unable to get power cap info from {gpu_string}") from e @@ -3487,7 +3492,9 @@ class AMDSMICommands(): self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}") elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap: try: - amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, args.power_cap * 1000000) + new_power_cap = AMDSMIHelpers.convert_SI_unit(args.power_cap, AMDSMIHelpers.SI_Unit.BASE, + AMDSMIHelpers.SI_Unit.MICRO) + amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, new_power_cap) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e @@ -3882,20 +3889,26 @@ class AMDSMICommands(): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") - default_power_cap = power_cap_info["default_power_cap"] + default_power_cap_in_w = power_cap_info["default_power_cap"] + default_power_cap_in_w = AMDSMIHelpers.convert_SI_unit(default_power_cap_in_w, AMDSMIHelpers.SI_Unit.MICRO) + current_power_cap_in_w = power_cap_info["power_cap"] + current_power_cap_in_w = AMDSMIHelpers.convert_SI_unit(current_power_cap_in_w, AMDSMIHelpers.SI_Unit.MICRO) except amdsmi_exception.AmdSmiLibraryException as 
e: raise ValueError(f"Unable to get power cap info from {gpu_id}") from e - if args.power_cap == default_power_cap: - self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap}") + if current_power_cap_in_w == default_power_cap_in_w: + self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap_in_w}") else: try: - amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap * 1000000) + default_power_cap_in_uw = AMDSMIHelpers.convert_SI_unit(default_power_cap_in_w, + AMDSMIHelpers.SI_Unit.BASE, + AMDSMIHelpers.SI_Unit.MICRO) + amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap_in_uw) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to reset power cap to {default_power_cap} on GPU {gpu_id}") from e - self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap}") + raise ValueError(f"Unable to reset power cap to {default_power_cap_in_w} on GPU {gpu_id}") from e + self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap_in_w}") if multiple_devices: self.logger.store_multiple_device_output() diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index 6383969a6a..fc19194727 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -29,6 +29,7 @@ import time from subprocess import run from subprocess import PIPE, STDOUT from typing import List +from enum import Enum from amdsmi_init import * from BDF import BDF @@ -726,3 +727,46 @@ class AMDSMIHelpers(): if logger.is_human_readable_format(): return f"{value} {unit}" return f"{value}" + + class SI_Unit(float, Enum): + GIGA = 1000000000 # 10^9 + MEGA = 1000000 # 10^6 + KILO = 1000 # 10^3 + HECTO = 100 # 10^2 + DEKA = 10 # 10^1 + BASE = 1 
# 10^0 + DECI = 0.1 # 10^-1 + CENTI = 0.01 # 10^-2 + MILLI = 0.001 # 10^-3 + MICRO = 0.000001 # 10^-6 + NANO = 0.000000001 # 10^-9 + + def convert_SI_unit(val: float, unit_in: SI_Unit, unit_out = SI_Unit.BASE) -> float: + """This function will convert a value into another + scientific (SI) unit. Defaults unit_out to SI_Unit.BASE + This function returns a float. + + params: + val: float unit to convert + unit_in: Requires using SI_Unit to set current value's SI unit (eg. SI_Unit.MICRO) + unit_out - Requires using SI_Unit to set current value's SI unit + default value is SI_Unit.BASE (eg. SI_Unit.MICRO) + return: + float : converted SI unit of value requested + """ + return val * unit_in / unit_out + + def convert_SI_unit(val: int, unit_in: SI_Unit, unit_out=SI_Unit.BASE) -> int: + """This function will convert a value into another + scientific (SI) unit. Defaults unit_out to SI_Unit.BASE + This function returns a int. + + params: + val: int unit to convert + unit_in: Requires using SI_Unit to set current value's SI unit (eg. SI_Unit.MICRO) + unit_out - Requires using SI_Unit to set current value's SI unit + default value is SI_Unit.BASE (eg. 
SI_Unit.MICRO) + return: + int : converted SI unit of value requested + """ + return int(float(val) * unit_in / unit_out) diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc index 25ac6ade00..92103085e9 100644 --- a/example/amd_smi_drm_example.cc +++ b/example/amd_smi_drm_example.cc @@ -657,15 +657,15 @@ int main() { CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_get_power_cap_info:\n"); std::cout << "\t\t Power Cap: " << cap_info.power_cap - << "W\n"; + << " uW\n"; std::cout << "\t\t Default Power Cap: " << cap_info.default_power_cap - << "\n\n"; + << " uW\n\n"; std::cout << "\t\t Dpm Cap: " << cap_info.dpm_cap - << "\n\n"; + << " MHz\n\n"; std::cout << "\t\t Min Power Cap: " << cap_info.min_power_cap - << "\n\n"; + << " uW\n\n"; std::cout << "\t\t Max Power Cap: " << cap_info.max_power_cap - << "\n\n"; + << " uW\n\n"; /// Get GPU Metrics info std::cout << "\n\n"; diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index f354515ecb..80b8417dd3 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -522,11 +522,11 @@ typedef struct { } amdsmi_pcie_info_t; typedef struct { - uint64_t power_cap; - uint64_t default_power_cap; - uint64_t dpm_cap; - uint64_t min_power_cap; - uint64_t max_power_cap; + uint64_t power_cap; //!< current power cap (uW) + uint64_t default_power_cap; //!< default power cap (uW) + uint64_t dpm_cap; //!< dpm power cap (MHz) + uint64_t min_power_cap; //!< minimum power cap (uW) + uint64_t max_power_cap; //!< maximum power cap (uW) uint64_t reserved[3]; } amdsmi_power_cap_info_t; @@ -4615,7 +4615,8 @@ amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_handle, amdsmi_board /** * @brief Returns the power caps as currently configured in the - * system. It is not supported on virtual machine guest + * system. Power in units of uW. 
+ * It is not supported on virtual machine guest * * @platform{gpu_bm_linux} @platform{host} * diff --git a/py-interface/README.md b/py-interface/README.md index 7d1f414565..e49a5699f4 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -414,13 +414,13 @@ Input parameters: Output: Dictionary with fields -Field | Description ----|--- -`power_cap` | power capability -`dpm_cap` | dynamic power management capability -`default_power_cap` | default power capability -`min_power_cap` | min power capability -`max_power_cap` | max power capability +Field | Description | Units +---|---|--- +`power_cap` | power capability | uW +`dpm_cap` | dynamic power management capability | MHz +`default_power_cap` | default power capability | uW +`min_power_cap` | min power capability | uW +`max_power_cap` | max power capability | uW Exceptions that can be thrown by `amdsmi_get_power_cap_info` function: diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index bb88f578cf..cc504ea3b9 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1208,15 +1208,10 @@ amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle, if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) set_ret_success = true; - // Dividing by 1000000 to get measurement in Watts - (info->default_power_cap) /= 1000000; status = rsmi_wrapper(rsmi_dev_power_cap_range_get, processor_handle, sensor_ind, &(info->max_power_cap), &(info->min_power_cap)); - // Dividing by 1000000 to get measurement in Watts - (info->max_power_cap) /= 1000000; - (info->min_power_cap) /= 1000000; if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) set_ret_success = true; diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc index 13762c3808..10f27ab309 100644 --- a/src/amd_smi/amd_smi_utils.cc +++ b/src/amd_smi/amd_smi_utils.cc @@ -201,8 +201,6 @@ amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int return AMDSMI_STATUS_API_FAILED; } - // Dividing 
by 1000000 to get measurement in Watts - *cap /= 1000000; return AMDSMI_STATUS_SUCCESS; } diff --git a/tests/amd_smi_test/functional/power_cap_read_write.cc b/tests/amd_smi_test/functional/power_cap_read_write.cc index dbf4726779..5e1a065a61 100755 --- a/tests/amd_smi_test/functional/power_cap_read_write.cc +++ b/tests/amd_smi_test/functional/power_cap_read_write.cc @@ -89,9 +89,10 @@ void TestPowerCapReadWrite::Close() { void TestPowerCapReadWrite::Run(void) { amdsmi_status_t ret; - uint64_t orig, min, max, new_cap; + uint64_t default_cap, min, max, new_cap, curr_cap; clock_t start, end; double cpu_time_used; + const uint64_t MICRO_CONVERSION = 1000000; TestBase::Run(); if (setup_failed_) { @@ -110,22 +111,24 @@ void TestPowerCapReadWrite::Run(void) { ASSERT_EQ(ret, AMDSMI_STATUS_INVAL); min = info.min_power_cap; max = info.max_power_cap; - orig = info.default_power_cap; + default_cap = info.default_power_cap; + curr_cap = info.power_cap; + new_cap = (max + min)/2; // Check if power cap is within the range // skip the test otherwise - if (orig < min || orig > max) { - std::cout << "Power cap is not within the range. Skipping test for " << dv_ind << std::endl; + if (new_cap < min || new_cap > max) { + std::cout << "Power cap requested (" << new_cap + << " uW) is not within the range. Skipping test for " << dv_ind << std::endl; continue; } - new_cap = (max + min)/2; - IF_VERB(STANDARD) { - std::cout << "Original Power Cap: " << orig << " uW" << std::endl; - std::cout << "Power Cap Range: " << max << " uW to " << min << + std::cout << "[Before Set] Default Power Cap: " << default_cap << " uW" << std::endl; + std::cout << "[Before Set] Current Power Cap: " << curr_cap << " uW" << std::endl; + std::cout << "[Before Set] Power Cap Range [max to min]: " << max << " uW to " << min << " uW" << std::endl; - std::cout << "Setting new cap to " << new_cap << "..." << std::endl; + std::cout << "[Before Set] Setting new cap to " << new_cap << "..." 
<< std::endl; } start = clock(); ret = amdsmi_set_power_cap(processor_handles_[dv_ind], 0, new_cap); @@ -142,25 +145,35 @@ void TestPowerCapReadWrite::Run(void) { ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, &info); CHK_ERR_ASRT(ret) - new_cap = info.default_power_cap; + curr_cap = info.power_cap; - // TODO(cfreehil) add some kind of assertion to verify new_cap is correct - // (or within a range) IF_VERB(STANDARD) { - std::cout << "Time spent: " << cpu_time_used << " uS" << std::endl; - std::cout << "New Power Cap: " << new_cap << " uW" << std::endl; - std::cout << "Resetting cap to " << orig << "..." << std::endl; + std::cout << "[After Set] Time spent: " << cpu_time_used << " uS" << std::endl; + std::cout << "[After Set] Current Power Cap: " << curr_cap << " uW" << std::endl; + std::cout << "[After Set] Requested Power Cap: " << new_cap << " uW" << std::endl; + std::cout << "[After Set] Power Cap Range [max to min]: " << max << " uW to " + << min << " uW" << std::endl; + std::cout << "[After Set] Resetting cap to " << default_cap << "..." 
<< std::endl; } + // Confirm in watts the values are equal + ASSERT_EQ(curr_cap/MICRO_CONVERSION, new_cap/MICRO_CONVERSION); - ret = amdsmi_set_power_cap(processor_handles_[dv_ind], 0, orig); + // Reset to default power cap + ret = amdsmi_set_power_cap(processor_handles_[dv_ind], 0, default_cap); CHK_ERR_ASRT(ret) ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, &info); CHK_ERR_ASRT(ret) - new_cap = info.default_power_cap; + curr_cap = info.power_cap; IF_VERB(STANDARD) { - std::cout << "Current Power Cap: " << new_cap << " uW" << std::endl; + std::cout << "[After Reset] Current Power Cap: " << curr_cap << " uW" << std::endl; + std::cout << "[After Reset] Requested Power Cap (default): " << default_cap << " uW" + << std::endl; + std::cout << "[After Reset] Power Cap Range [max to min]: " << max << " uW to " + << min << " uW" << std::endl; } + // Confirm in watts the values are equal + ASSERT_EQ(curr_cap/MICRO_CONVERSION, default_cap/MICRO_CONVERSION); } } From 11c72946eb114683538ad03861594524df8ee0c8 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Thu, 2 May 2024 15:27:16 -0500 Subject: [PATCH 11/13] Revert "SWDEV-458102 - Deprecated Voltage Curve API" This reverts commit 1423fb632e340359bd3e4d7e71035112199eb08e. 
Change-Id: I8a3eaf0a9f28200e09fb35d5260fbc070fe8a4a9 --- CHANGELOG.md | 52 +-- amdsmi_cli/README.md | 3 +- amdsmi_cli/amdsmi_commands.py | 38 +- amdsmi_cli/amdsmi_parser.py | 2 + include/amd_smi/amdsmi.h | 13 +- py-interface/README.md | 4 +- rocm_smi/include/rocm_smi/rocm_smi.h | 6 +- rocm_smi/include/rocm_smi/rocm_smi_utils.h | 330 +----------------- rocm_smi/src/rocm_smi.cc | 159 +++++---- rocm_smi/src/rocm_smi_utils.cc | 9 +- .../functional/mutual_exclusion.cc | 8 +- .../functional/volt_freq_curv_read.cc | 22 +- 12 files changed, 192 insertions(+), 454 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad3eba258e..6d7275eb7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ***All information listed below is for reference and subject to change.*** -## amd_smi_lib for ROCm 6.2 (Unreleased) +## amd_smi_lib for ROCm 6.1.2 ### Added @@ -42,11 +42,22 @@ GPU: 1 - **`amdsmi_get_power_cap_info` now returns values in uW instead of W** `amdsmi_get_power_cap_info` will return in uW as originally reflected by driver. Previously `amdsmi_get_power_cap_info` returned W values, this conflicts with our sets and modifies values retrieved from driver. We decided to keep the values returned from driver untouched (in original units, uW). Then in CLI we will convert to watts (as previously done - no changes here). Additionally, driver made updates to min power cap displayed for devices when overdrive is disabled which prompted for this change (in this case min_power_cap and max_power_cap are the same). +- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** +Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. 
+ ### Optimizations -- N/A +- **Updated `amd-smi monitor --pcie` output** +The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output: + +```shell +$ amd-smi monitor --pcie +GPU PCIE_BW + 0 26 Mb/s +``` ### Fixed + - **Fixed `amd-smi metric --power` now provides power output for Navi2x/Navi3x/MI1x** These systems use an older version of gpu_metrics in amdgpu. This fix only updates what CLI outputs. No change in any of our APIs. @@ -70,45 +81,13 @@ GPU: 1 POWER_MANAGEMENT: ENABLED THROTTLE_STATUS: UNTHROTTLED ``` + - **Fixed `amdsmitstReadWrite.TestPowerCapReadWrite` test for Navi3X, Navi2X, MI100** Updates required `amdsmi_get_power_cap_info` to return in uW as originally reflected by driver. Previously `amdsmi_get_power_cap_info` returned W values, this conflicts with our sets and modifies values retrieved from driver. We decided to keep the values returned from driver untouched (in original units, uW). Then in CLI we will convert to watts (as previously done - no changes here). Additionally, driver made updates to min power cap displayed for devices when overdrive is disabled which prompted for this change (in this case min_power_cap and max_power_cap are the same). - -## amd_smi_lib for ROCm 6.1.2 - -### Added - -- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** -Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. - -### Changed - -- **Deprecated Volt Curve APIs** -The latest amdgpu driver has dropped support for getting and setting volt curve information. amdsmi_set_gpu_od_volt_info() & amdsmi_get_gpu_od_volt_curve_regions() have been deprecated with amdsmi_get_gpu_od_volt_info() now no longer populating voltage curve frequencies. 
- -- **Removed `amd-smi metric --voltage-curve` from CLI Tool** -Due to amdgpu driver dropping support for voltage curve, the CLI option has been removed as well. - -### Optimizations - -- **Updated `amd-smi monitor --pcie` output** -The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output: - -```shell -$ amd-smi monitor --pcie -GPU PCIE_BW - 0 26 Mb/s -``` - -### Fixed - - **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well. -### Known issues - -- None - ## amd_smi_lib for ROCm 6.1.1 ### Added @@ -428,9 +407,6 @@ $ /opt/rocm/bin/amd-smi topology -a -t --json ### Fixed -- **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** -Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well. - - **Fix for GPU reset error on non-amdgpu cards** Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix updates CLI to target only AMD ASICs. diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 7b2252911e..affc7a476b 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -269,7 +269,7 @@ Command Modifiers: ~$ amd-smi metric --help usage: amd-smi metric [-h] [-g GPU [GPU ...] | -U CPU [CPU ...] 
| -O CORE [CORE ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-m] [-u] [-p] [-c] [-t] - [-P] [-e] [-k] [-f] [-o] [-l] [-x] [-E] [--cpu-power-metrics] + [-P] [-e] [-k] [-f] [-C] [-o] [-l] [-x] [-E] [--cpu-power-metrics] [--cpu-prochot] [--cpu-freq-metrics] [--cpu-c0-res] [--cpu-lclk-dpm-level NBIOID] [--cpu-pwr-svi-telemtry-rails] [--cpu-io-bandwidth IO_BW LINKID_NAME] @@ -313,6 +313,7 @@ Metric arguments: -e, --ecc Total number of ECC errors -k, --ecc-blocks Number of ECC errors per block -f, --fan Current fan speed + -C, --voltage-curve Display voltage curve -o, --overdrive Current GPU clock overdrive level -l, --perf-level Current DPM performance level -x, --xgmi-err XGMI error information since last read diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 6cf63f9b92..460a0f84d3 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1115,7 +1115,7 @@ class AMDSMICommands(): def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=None, usage=None, watch=None, watch_time=None, iterations=None, power=None, clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, - fan=None, overdrive=None, perf_level=None, + fan=None, voltage_curve=None, overdrive=None, perf_level=None, xgmi_err=None, energy=None, mem_usage=None, schedule=None, guard=None, guest_data=None, fb_usage=None, xgmi=None,): """Get Metric information for target gpu @@ -1136,6 +1136,7 @@ class AMDSMICommands(): ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None. pcie (bool, optional): Value override for args.pcie. Defaults to None. fan (bool, optional): Value override for args.fan. Defaults to None. + voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None. overdrive (bool, optional): Value override for args.overdrive. Defaults to None. perf_level (bool, optional): Value override for args.perf_level. Defaults to None. 
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. @@ -1194,6 +1195,8 @@ class AMDSMICommands(): if self.helpers.is_baremetal() and self.helpers.is_linux(): if fan: args.fan = fan + if voltage_curve: + args.voltage_curve = voltage_curve if overdrive: args.overdrive = overdrive if perf_level: @@ -1202,8 +1205,8 @@ class AMDSMICommands(): args.xgmi_err = xgmi_err if energy: args.energy = energy - current_platform_args += ["fan", "overdrive", "perf_level", "xgmi_err", "energy"] - current_platform_values += [args.fan, args.overdrive, args.perf_level, args.xgmi_err, args.energy] + current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy"] + current_platform_values += [args.fan, args.voltage_curve, args.overdrive, args.perf_level, args.xgmi_err, args.energy] if self.helpers.is_hypervisor(): if schedule: @@ -1785,6 +1788,26 @@ class AMDSMICommands(): logging.debug("Failed to get fan rpms for gpu %s | %s", args.gpu, e.get_error_info()) values_dict["fan"] = fan_dict + if "voltage_curve" in current_platform_args: + if args.voltage_curve: + try: + od_volt = amdsmi_interface.amdsmi_get_gpu_od_volt_info(args.gpu) + + voltage_point_dict = {} + + for point in range(3): + if isinstance(od_volt, dict): + frequency = int(od_volt["curve.vc_points"][point].frequency / 1000000) + voltage = int(od_volt["curve.vc_points"][point].voltage) + else: + frequency = 0 + voltage = 0 + voltage_point_dict[f'voltage_point_{point}'] = f"{frequency} Mhz {voltage} mV" + + values_dict['voltage_curve'] = voltage_point_dict + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['voltage_curve'] = "N/A" + logging.debug("Failed to get voltage curve for gpu %s | %s", gpu_id, e.get_error_info()) if "overdrive" in current_platform_args: if args.overdrive: try: @@ -2300,7 +2323,7 @@ class AMDSMICommands(): def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, usage=None, watch=None, watch_time=None, 
iterations=None, power=None, clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, - fan=None, overdrive=None, perf_level=None, + fan=None, voltage_curve=None, overdrive=None, perf_level=None, xgmi_err=None, energy=None, mem_usage=None, schedule=None, guard=None, guest_data=None, fb_usage=None, xgmi=None, cpu=None, cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None, @@ -2329,6 +2352,7 @@ class AMDSMICommands(): ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None. pcie (bool, optional): Value override for args.pcie. Defaults to None. fan (bool, optional): Value override for args.fan. Defaults to None. + voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None. overdrive (bool, optional): Value override for args.overdrive. Defaults to None. perf_level (bool, optional): Value override for args.perf_level. Defaults to None. xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. @@ -2382,7 +2406,7 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock", - "temperature", "ecc", "ecc_blocks", "pcie", "fan", + "temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule", "guard", "guest_data", "fb_usage", "xgmi"] for attr in gpu_attributes: @@ -2455,7 +2479,7 @@ class AMDSMICommands(): self.metric_gpu(args, multiple_devices, watching_output, gpu, usage, watch, watch_time, iterations, power, clock, temperature, ecc, ecc_blocks, pcie, - fan, overdrive, perf_level, + fan, voltage_curve, overdrive, perf_level, xgmi_err, energy, mem_usage, schedule, guard, guest_data, fb_usage, xgmi) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized @@ -2490,7 +2514,7 @@ class AMDSMICommands(): self.metric_gpu(args, multiple_devices, watching_output, gpu, usage, watch, 
watch_time, iterations, power, clock, temperature, ecc, ecc_blocks, pcie, - fan, overdrive, perf_level, + fan, voltage_curve, overdrive, perf_level, xgmi_err, energy, mem_usage, schedule) diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index af22db7137..4b11188b03 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -699,6 +699,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Help text for Arguments only on Linux Baremetal platforms fan_help = "Current fan speed" + vc_help = "Display voltage curve" overdrive_help = "Current GPU clock overdrive level" perf_level_help = "Current DPM performance level" xgmi_err_help = "XGMI error information since last read" @@ -769,6 +770,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Optional Args for Linux Baremetal Systems if self.helpers.is_baremetal() and self.helpers.is_linux(): metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) + metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help) metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help) diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 80b8417dd3..dcfdfcf7e6 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -1253,13 +1253,12 @@ typedef struct { typedef struct { amdsmi_range_t curr_sclk_range; //!< The current SCLK frequency range amdsmi_range_t curr_mclk_range; //!< The current MCLK frequency range; - //!< (upper bound only) + //!< (upper bound only) amdsmi_range_t sclk_freq_limits; //!< The range possible of SCLK values amdsmi_range_t mclk_freq_limits; //!< The range possible of MCLK values /** * @brief The current voltage curve - 
* @deprecated ::Voltage curve support has been deprecated by the driver */ amdsmi_od_volt_curve_t curve; uint32_t num_regions; //!< The number of voltage curve regions @@ -2966,7 +2965,7 @@ amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_status_t amdsmi_reset_gpu(amdsmi_processor_handle processor_handle); /** - * @brief This function retrieves the overdrive GFX & MCLK information. It is + * @brief This function retrieves the voltage/frequency curve information. It is * not supported on virtual machine guest * * @platform{gpu_bm_linux} @@ -3167,9 +3166,6 @@ amdsmi_status_t amdsmi_set_gpu_od_clk_info(amdsmi_processor_handle processor_han * * @platform{gpu_bm_linux} * - * @deprecated ::Voltage curve information is no longer supported by the - * amdgpu driver; this includes the ability to set voltage curve regions - * * @details Given a processor handle @p processor_handle, a voltage point @p vpoint * and a voltage value @p voltvalue this function will set voltage curve point * @@ -3196,9 +3192,6 @@ amdsmi_status_t amdsmi_set_gpu_od_volt_info(amdsmi_processor_handle processor_ha * * @platform{gpu_bm_linux} * - * @deprecated ::Voltage curve information is no longer supported by the - * amdgpu driver; this includes the number of valid voltage regions - * * @details Given a processor handle @p processor_handle, a pointer to an unsigned integer * @p num_regions and a buffer of ::amdsmi_freq_volt_region_t structures, @p * buffer, this function will populate @p buffer with the current @@ -3509,7 +3502,7 @@ amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle process * @platform{gpu_bm_linux} @platform{guest_1vf} * * @details Given a processor handle @p processor_handle, and a sclean flag @p sclean, - * this function will clear the SRAM data of this processor. This can be called between + * this function will clear the SRAM data of this processor. This can be called between * user logins to prevent information leak. 
* * @note This function requires root access diff --git a/py-interface/README.md b/py-interface/README.md index e49a5699f4..6bbe75be7f 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -1591,7 +1591,7 @@ except AmdSmiException as e: ### amdsmi_set_gpu_od_clk_info -Description: **deprecated** This function sets the clock frequency information +Description: This function sets the clock frequency information It is not supported on virtual machine guest Input parameters: @@ -2306,7 +2306,7 @@ except AmdSmiException as e: ### amdsmi_get_gpu_od_volt_curve_regions -Description: **deprecated** This function will retrieve the current valid regions in the +Description: This function will retrieve the current valid regions in the frequency/voltage space. It is not supported on virtual machine guest Input parameters: diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 8797cf1b5f..0fafa31c8f 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3058,8 +3058,6 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, /** * @brief This function sets 1 of the 3 voltage curve points. * - * @deprecated This function is deprecated due to driver changes. - * * @details Given a device index @p dv_ind, a voltage point @p vpoint * and a voltage value @p voltvalue this function will set voltage curve point * @@ -3085,8 +3083,6 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, * @brief This function will retrieve the current valid regions in the * frequency/voltage space. * - * @deprecated This function is deprecated due to driver changes. 
- * * @details Given a device index @p dv_ind, a pointer to an unsigned integer * @p num_regions and a buffer of ::rsmi_freq_volt_region_t structures, @p * buffer, this function will populate @p buffer with the current @@ -3452,7 +3448,7 @@ rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, uint32_t pisolate); /** - * @brief Clear the GPU SRAM data + * @brief Clear the GPU SRAM data * * * @details Given a device index @p dv_ind, this function will clear the diff --git a/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/rocm_smi/include/rocm_smi/rocm_smi_utils.h index 32e6bdeefc..67d9d8b8d8 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -45,17 +45,14 @@ #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include +#include #include +#include +#include +#include +#include +#include #include "rocm_smi/rocm_smi_device.h" @@ -128,33 +125,13 @@ std::string print_rsmi_od_volt_freq_regions(uint32_t num_regions, bool is_sudo_user(); rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, std::string *gfx_version); - -std::string leftTrim(const std::string &s); -std::string rightTrim(const std::string &s); -std::string trim(const std::string &s); -std::string removeNewLines(const std::string &s); - -std::string removeString(const std::string origStr, - const std::string &removeMe); template - std::string print_int_as_hex(T i, bool showHexNotation = true, - int overloadBitSize = 0) { + std::string print_int_as_hex(T i, bool showHexNotation = true) { std::stringstream ss; if (showHexNotation) { - if (overloadBitSize == 0) { - ss << "0x" << std::hex << std::setw(sizeof(T) * 2) << std::setfill('0'); - } else { - // 8 bits per 1 byte - int byteSize = (overloadBitSize / 8) * 2; - ss << "0x" << std::hex << std::setw(byteSize) << std::setfill('0'); - } + ss << "0x" << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; } else { - if (overloadBitSize == 0) { - 
ss << std::hex << std::setw(sizeof(T) * 2) << std::setfill('0'); - } else { - int byteSize = (overloadBitSize / 8) * 2; - ss << std::hex << std::setw(byteSize) << std::setfill('0'); - } + ss << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; } if (std::is_same::value) { @@ -185,8 +162,7 @@ std::string print_unsigned_hex_and_int(T i, std::string heading="") { } ss << "Hex (MSB): " << print_int_as_hex(i) << ", " << "Unsigned int: " << print_unsigned_int(i) << ", " - << "Byte Size: " << sizeof(T) << ", " - << "Bits: " << sizeof(T) * 8; // 8 bits per 1 byte + << "Byte Size: " << sizeof(T); return ss.str(); } @@ -307,290 +283,8 @@ class ScopedAcquire { // In VM environment, the /proc/cpuinfo set hypervisor flag by default bool is_vm_guest(); - -// -enum class TagSplitterPositional_t -{ - kFIRST, - kBETWEEN, - kLAST, - kNONE, -}; - -template -class TagTextContents_t -{ - public: - using TextLines_t = std::vector; - using PrimaryList_t = std::vector; - using SecondaryList_t = std::vector; - using PrimaryKeyTbl_t = std::map; - using SecondaryKeyTbl_t = std::map; - using StructuredKeysTbl_t = std::map>; - - // - TagTextContents_t() = default; - TagTextContents_t(const TagTextContents_t&) = delete; - TagTextContents_t(TagTextContents_t&&) = delete; - TagTextContents_t& operator=(const TagTextContents_t&) = delete; - TagTextContents_t& operator=(TagTextContents_t&&) = delete; - - explicit TagTextContents_t(const TextLines_t& text_content) - : m_text_content(text_content) {} - - TagTextContents_t& set_text_content(const TextLines_t& text_content) - { - m_text_content = text_content; - } - - TagTextContents_t& set_title_terminator(const std::string& title_mark, - TagSplitterPositional_t title_mark_position) { - m_title_mark = title_mark; - m_title_mark_position = title_mark_position; - - return *this; - } - - TagTextContents_t& set_key_data_splitter(const std::string& line_splitter_mark, - TagSplitterPositional_t line_mark_position) { - m_line_splitter_mark = 
line_splitter_mark; - m_line_mark_position = line_mark_position; - - return *this; - } - - TagTextContents_t& structure_content() { - // Sanitizes the content. - if (!m_text_content.empty()) { - std::for_each(m_text_content.begin(), m_text_content.end(), trim); - section_title_lookup(); - section_data_lookup(); - } - - return *this; - } - - decltype(auto) get_title_size() { - return m_primary.size(); - } - - decltype(auto) get_structured_subkeys_size(const PrimaryKeyType& prim_key) { - return m_structured[prim_key].size(); - } - - decltype(auto) contains_title_key(const PrimaryKeyType& key) { - return (m_primary.find(key) != m_primary.end()); - } - - decltype(auto) contains_structured_key(const PrimaryKeyType& prim_key, - const SecondaryKeyType& sec_key) { - if (auto first_key_itr = m_structured.find(prim_key); - first_key_itr != m_structured.end()) { - if (auto sec_key_itr = first_key_itr->second.find(sec_key); - sec_key_itr != first_key_itr->second.end()) { - return true; - } - } - - return false; - } - - decltype(auto) get_structured_value_by_keys(const PrimaryKeyType& prim_key, - const SecondaryKeyType& sec_key, - bool is_value_id = true) { - if (auto first_key_itr = m_structured.find(prim_key); - first_key_itr != m_structured.end()) { - if (auto sec_key_itr = first_key_itr->second.find(sec_key); - sec_key_itr != first_key_itr->second.end()) { - SecondaryDataType key_value{}; - if (is_value_id) { - key_value = SecondaryDataType(sec_key_itr->first) + " "; - } - key_value += sec_key_itr->second; - return key_value; - } - } - - return SecondaryDataType{}; - } - - decltype(auto) get_structured_data_subkey_by_position(const PrimaryKeyType& prim_key, - uint32_t key_position) { - auto key_counter = uint32_t(0); - SecondaryKeyType data_key{}; - if (key_position < (get_structured_subkeys_size(prim_key))) { - for (const auto& [sec_key, sec_value] : m_structured[prim_key]) { - if (key_counter == key_position) { - data_key = static_cast(sec_key); - return data_key; - } - 
++key_counter; - } - } - - return data_key; - } - - decltype(auto) get_structured_data_subkey_first(const PrimaryKeyType& prim_key) { - return (get_structured_value_by_keys(prim_key, - get_structured_data_subkey_by_position(prim_key, 0))); - } - - decltype(auto) get_structured_data_subkey_last(const PrimaryKeyType& prim_key) { - return (get_structured_value_by_keys(prim_key, get_structured_data_subkey_by_position(prim_key, - (get_structured_subkeys_size(prim_key) - 1)))); - } - - void reset() { - m_text_content.clear(); - m_primary.clear(); - m_structured.clear(); - m_title_mark.clear(); - m_line_splitter_mark.clear(); - m_title_mark_position = TagSplitterPositional_t::kNONE; - m_line_mark_position = TagSplitterPositional_t::kNONE; - } - - decltype(auto) dump_structured_content() { - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; - ostrstream << "** Primary Table **" << "\n"; - for (const auto& [key, values] : m_primary) { - ostrstream << "key: " << key << " values: " << values.size() << "\n"; - for (const auto& value : values) { - ostrstream << "\t value: " << value << "\n"; - } - } - - ostrstream << "\n ** Structured Table **" << "\n"; - for (const auto& [prim_key, prim_values] : m_structured) { - ostrstream << "key: " << prim_key << "\n"; - for (const auto& [sec_key, sec_value] : prim_values) { - ostrstream << "\t key: " << sec_key << " -> " << sec_value << "\n"; - } - } - ostrstream << "\n\n"; - - return ostrstream.str(); - } - - - private: - TextLines_t m_text_content; - PrimaryKeyTbl_t m_primary; - StructuredKeysTbl_t m_structured; - std::string m_title_mark; - std::string m_line_splitter_mark; - TagSplitterPositional_t m_title_mark_position; - TagSplitterPositional_t m_line_mark_position; - - // - // Note: Organizes table with Title as a Key, and a list of values. 
- // - decltype(auto) section_title_lookup() { - if (m_title_mark.empty() || - m_title_mark_position == TagSplitterPositional_t::kNONE) { - return; - } - - // - // Note: - // - top_title_line: Left pointer for the sliding window - // - bottom_title_line: Right pointer for the sliding window - // - auto top_title_line = uint32_t(std::numeric_limits::max()); - auto bottom_title_line = uint32_t(std::numeric_limits::max()); - auto line_counter = uint32_t(0); - - // - // Note: This whole interval/window where the section/title starts, and where it ends. - // - auto update_primary_tbl = [&](const uint32_t& from_line, const uint32_t& to_line) { - auto key = static_cast(m_text_content[from_line]); - for (auto line_num(from_line + 1); line_num < to_line; ++line_num) { - if ((line_num < m_text_content.size()) && !m_text_content[line_num].empty()) { - m_primary[key].push_back(m_text_content[line_num]); - } - } - }; - - auto adjust_sliding_window = [&](const uint32_t& title_line) { - // First time top_title_line gets adjusted. 
- if (top_title_line == uint32_t(std::numeric_limits::max())) { - top_title_line = title_line; - bottom_title_line = top_title_line; - return; - } - if (title_line > bottom_title_line) { - bottom_title_line = title_line; - update_primary_tbl(top_title_line, bottom_title_line); - top_title_line = bottom_title_line; - } - }; - - for (const auto& line : m_text_content) { - auto was_title_found{false}; - switch (m_title_mark_position) { - case TagSplitterPositional_t::kFIRST: - // Section/Title Mark was found at the first position - if (line.find_first_of(m_title_mark.c_str()) == 0) { - was_title_found = true; - } - break; - - case TagSplitterPositional_t::kLAST: - // Section/Title Mark was found at the last position - if ((line.find_last_of(m_title_mark.c_str()) + 1) == line.size()) { - was_title_found = true; - } - break; - - default: - break; - } - - if (was_title_found) { - adjust_sliding_window(line_counter); - } - ++line_counter; - } - - // Any remaining elements? - if (line_counter > bottom_title_line) { - update_primary_tbl(bottom_title_line, (line_counter - 1)); - } - } - - decltype(auto) section_data_lookup() { - if (m_line_splitter_mark.empty() || - m_line_mark_position == TagSplitterPositional_t::kNONE) { - return; - } - - // - // Note: Organizes table with Title as a Key, a Key/ID for values and values. - // It takes into consideration the initial constraints were all good and - // that the primary table has been populated. 
- for (const auto& [prim_key, prim_values] : m_primary) { - for (const auto& value : prim_values) { - if (auto mark_pos = value.find_first_of(m_line_splitter_mark.c_str()); - mark_pos != std::string::npos) { - auto sec_key = trim(value.substr(0, mark_pos + 1)); - auto sec_data = trim(value.substr((mark_pos + 1), value.size())); - if (!sec_key.empty()) { - m_structured[prim_key].insert(std::make_pair(sec_key, sec_data)); - } - } - } - } - } - -}; - -using TextFileTagContents_t = TagTextContents_t; - +// trim a string +std::string trim(const std::string &s); } // namespace smi } // namespace amd diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index aa5f30d9d1..dd8e903328 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -1415,6 +1415,17 @@ For the new format, GFXCLK field will show min and max values(0/1). If the curre frequency in neither min/max but lies within the range, this is indicated by an additional value followed by * at index 1 and max value at index 2. 
*/ +constexpr uint32_t kOD_SCLK_label_array_index = 0; +constexpr uint32_t kOD_MCLK_label_array_index = + kOD_SCLK_label_array_index + 3; +constexpr uint32_t kOD_VDDC_CURVE_label_array_index = + kOD_MCLK_label_array_index + 2; +constexpr uint32_t kOD_OD_RANGE_label_array_index = + kOD_VDDC_CURVE_label_array_index + 4; +constexpr uint32_t kOD_VDDC_CURVE_start_index = + kOD_OD_RANGE_label_array_index + 3; +// constexpr uint32_t kOD_VDDC_CURVE_num_lines = +// kOD_VDDC_CURVE_start_index + 4; constexpr uint32_t kMIN_VALID_LINES = 2; static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, @@ -1439,75 +1450,62 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, return RSMI_STATUS_NOT_YET_IMPLEMENTED; } - // - const std::string kTAG_OD_SCLK{"OD_SCLK:"}; - const std::string kTAG_GFXCLK{"GFXCLK:"}; - const std::string KTAG_OD_MCLK{"OD_MCLK:"}; - const std::string KTAG_MCLK{"MCLK:"}; - const std::string KTAG_FIRST_FREQ_IDX{"0:"}; - amd::smi::TextFileTagContents_t txt_power_dev_od_voltage(val_vec); - txt_power_dev_od_voltage - .set_title_terminator(":", amd::smi::TagSplitterPositional_t::kLAST) - .set_key_data_splitter(":", amd::smi::TagSplitterPositional_t::kBETWEEN) - .structure_content(); - - // - // Note: We must have minimum of 'GFXCLK:' && 'MCLK:' OR: - // 'OD_SCLK:' && 'OD_MCLK:' tags. - if (txt_power_dev_od_voltage.get_title_size() < kMIN_VALID_LINES) { - return rsmi_status_t::RSMI_STATUS_NO_DATA; + assert(val_vec[kOD_SCLK_label_array_index] == "OD_SCLK:" || + val_vec[kOD_SCLK_label_array_index] == "GFXCLK:"); + if ((val_vec[kOD_SCLK_label_array_index] != "OD_SCLK:") && + (val_vec[kOD_SCLK_label_array_index] != "GFXCLK:")) { + return RSMI_STATUS_UNEXPECTED_DATA; } - // Note: For debug builds/purposes only. - assert(txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) || - txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK)); - // Note: For release builds/purposes. 
- if (!txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) && - !txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK)) { - return rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + + // find last_item but skip empty lines + int last_item = val_vec.size()-1; + while (val_vec[last_item].empty() || val_vec[last_item][0] == 0) + last_item--; + + p->curr_sclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_SCLK_label_array_index + 1); + p->curr_sclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_SCLK_label_array_index + 2); + + if (val_vec.size() < (kOD_MCLK_label_array_index + 1)) { + return RSMI_STATUS_UNEXPECTED_SIZE; + } + // The condition below checks if it is the old style or new style format. + if (val_vec[kOD_MCLK_label_array_index] == "OD_MCLK:") { + p->curr_mclk_range.lower_bound = 0; + p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_MCLK_label_array_index + 1); + } else if (val_vec[kOD_MCLK_label_array_index] == "MCLK:") { + p->curr_mclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_MCLK_label_array_index + 1); + // the upper memory frequency is the last + p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, last_item); + return RSMI_STATUS_SUCCESS; + } else { + if (val_vec.size() < (kOD_MCLK_label_array_index + 3)) { + return RSMI_STATUS_UNEXPECTED_SIZE; + } + if (val_vec[kOD_MCLK_label_array_index + 1] == "MCLK:") { + p->curr_sclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_SCLK_label_array_index + 3); + p->curr_mclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_MCLK_label_array_index + 2); + // the upper memory frequency is the last + p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, last_item); + return RSMI_STATUS_SUCCESS; + } + return RSMI_STATUS_NOT_YET_IMPLEMENTED; } - // Note: Quick helpers for getting 1st and last elements 
found - auto build_lower_bound = [&](const std::string& prim_key) { - auto lower_bound_data = txt_power_dev_od_voltage.get_structured_data_subkey_first(prim_key); - return std::vector{lower_bound_data}; - }; - - auto build_upper_bound = [&](const std::string& prim_key) { - auto upper_bound_data = txt_power_dev_od_voltage.get_structured_data_subkey_last(prim_key); - return std::vector{upper_bound_data}; - }; - - // Validates 'OD_SCLK' is in the structure - if (txt_power_dev_od_voltage.contains_structured_key(kTAG_OD_SCLK, - KTAG_FIRST_FREQ_IDX)) { - p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); - p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); - - // Validates 'OD_MCLK' is in the structure - if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_MCLK, - KTAG_FIRST_FREQ_IDX)) { - p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); - p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); - } + if (val_vec.size() < kOD_VDDC_CURVE_label_array_index) { + return RSMI_STATUS_UNEXPECTED_SIZE; } - // Validates 'GFXCLK' is in the structure - else if (txt_power_dev_od_voltage.contains_structured_key(kTAG_GFXCLK, - KTAG_FIRST_FREQ_IDX)) { - p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_GFXCLK), nullptr, nullptr, 0); - p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_GFXCLK), nullptr, nullptr, 0); - // Validates 'MCLK' is in the structure - if (txt_power_dev_od_voltage.contains_structured_key(KTAG_MCLK, - KTAG_FIRST_FREQ_IDX)) { - p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_MCLK), nullptr, nullptr, 0); - p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_MCLK), nullptr, nullptr, 0); - } - } - else { - return RSMI_STATUS_NOT_YET_IMPLEMENTED; - 
} - p->num_regions = 0; + p->num_regions = + static_cast((val_vec.size()) / 2); return RSMI_STATUS_SUCCESS; CATCH @@ -1676,6 +1674,30 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, } +static void get_vc_region(uint32_t start_ind, + std::vector *val_vec, rsmi_freq_volt_region_t *p) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + assert(p != nullptr); + assert(val_vec != nullptr); + THROW_IF_NULLPTR_DEREF(p) + THROW_IF_NULLPTR_DEREF(val_vec) + + // There must be at least 1 region to read in + assert(val_vec->size() >= kOD_OD_RANGE_label_array_index + 2); + assert((*val_vec)[kOD_OD_RANGE_label_array_index] == "OD_RANGE:"); + if ((val_vec->size() < kOD_OD_RANGE_label_array_index + 2) || + ((*val_vec)[kOD_OD_RANGE_label_array_index] != "OD_RANGE:") ) { + ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA); + LOG_TRACE(ss); + throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA, __FUNCTION__); + } + od_value_pair_str_to_range((*val_vec)[start_ind], &p->freq_range); + od_value_pair_str_to_range((*val_vec)[start_ind + 1], &p->volt_range); +} + /* * num_regions [inout] on calling, the number of regions requested to be read * in. At completion, the number of regions actually read in @@ -1707,20 +1729,23 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, // This is a work-around to handle systems where kDevPowerODVoltage is not // fully supported yet. 
- if (val_vec.size() < kMIN_VALID_LINES) { + if (val_vec.size() < 2) { ss << __PRETTY_FUNCTION__ - << " | Issue: val_vec.size() < " << kMIN_VALID_LINES << "; returning " + << " | Issue: val_vec.size() < 2" << "; returning " << getRSMIStatusString(RSMI_STATUS_NOT_YET_IMPLEMENTED); LOG_ERROR(ss); return RSMI_STATUS_NOT_YET_IMPLEMENTED; } uint32_t val_vec_size = static_cast(val_vec.size()); + assert((val_vec_size - kOD_VDDC_CURVE_start_index) > 0); + ss << __PRETTY_FUNCTION__ << " | val_vec_size = " << std::dec - << val_vec_size; + << val_vec_size + << " | kOD_VDDC_CURVE_start_index = " << kOD_VDDC_CURVE_start_index; LOG_DEBUG(ss); - *num_regions = 0; + *num_regions = std::min((val_vec_size) / 2, *num_regions); return RSMI_STATUS_SUCCESS; CATCH diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc index 45dd3fe40f..61ec4243dc 100755 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -1134,6 +1134,14 @@ std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv) { ss << pt_rng_Mhz("\t**Current SCLK frequency range: ", &odv->curr_sclk_range); ss << pt_rng_Mhz("\t**Current MCLK frequency range: ", &odv->curr_mclk_range); + ss << pt_rng_Mhz("\t**Min/Max Possible SCLK frequency range: ", + &odv->sclk_freq_limits); + ss << pt_rng_Mhz("\t**Min/Max Possible MCLK frequency range: ", + &odv->mclk_freq_limits); + + ss << "\t**Current Freq/Volt. curve: " << "\n"; + ss << pt_vddc_curve(&odv->curve); + ss << "\t**Number of Freq./Volt. 
regions: " << odv->num_regions << "\n\n"; return ss.str(); } @@ -1216,6 +1224,5 @@ std::queue getAllDeviceGfxVers() { return deviceGfxVersions; } - } // namespace smi } // namespace amd diff --git a/tests/amd_smi_test/functional/mutual_exclusion.cc b/tests/amd_smi_test/functional/mutual_exclusion.cc index e5578619f1..48bbe82934 100755 --- a/tests/amd_smi_test/functional/mutual_exclusion.cc +++ b/tests/amd_smi_test/functional/mutual_exclusion.cc @@ -183,10 +183,10 @@ void TestMutualExclusion::Run(void) { int64_t dmy_i64 = 0; char dmy_str[10]; amdsmi_dev_perf_level_t dmy_perf_lvl; - amdsmi_frequencies_t dmy_freqs{}; - amdsmi_od_volt_freq_data_t dmy_od_volt{}; - amdsmi_freq_volt_region_t dmy_vlt_reg{}; - amdsmi_error_count_t dmy_err_cnt{}; + amdsmi_frequencies_t dmy_freqs; + amdsmi_od_volt_freq_data_t dmy_od_volt; + amdsmi_freq_volt_region_t dmy_vlt_reg; + amdsmi_error_count_t dmy_err_cnt; amdsmi_ras_err_state_t dmy_ras_err_st; // This can be replaced with ASSERT_EQ() once env. stabilizes diff --git a/tests/amd_smi_test/functional/volt_freq_curv_read.cc b/tests/amd_smi_test/functional/volt_freq_curv_read.cc index 080d8e9a1a..4c1a758fc9 100755 --- a/tests/amd_smi_test/functional/volt_freq_curv_read.cc +++ b/tests/amd_smi_test/functional/volt_freq_curv_read.cc @@ -146,7 +146,7 @@ static void print_amdsmi_od_volt_freq_regions(uint32_t num_regions, void TestVoltCurvRead::Run(void) { amdsmi_status_t err; - amdsmi_od_volt_freq_data_t odv{}; + amdsmi_od_volt_freq_data_t odv; TestBase::Run(); if (setup_failed_) { @@ -177,5 +177,25 @@ void TestVoltCurvRead::Run(void) { err = amdsmi_get_gpu_od_volt_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_INVAL); } + + if (err == AMDSMI_STATUS_SUCCESS) { + std::cout << "\t**Frequency-voltage curve data:" << std::endl; + print_amdsmi_od_volt_freq_data_t(&odv); + + amdsmi_freq_volt_region_t *regions; + uint32_t num_regions; + regions = new amdsmi_freq_volt_region_t[odv.num_regions]; + ASSERT_TRUE(regions != nullptr); + + 
num_regions = odv.num_regions; + err = amdsmi_get_gpu_od_volt_curve_regions(processor_handles_[i], &num_regions, regions); + CHK_ERR_ASRT(err) + ASSERT_TRUE(num_regions == odv.num_regions); + + std::cout << "\t**Frequency-voltage curve regions:" << std::endl; + print_amdsmi_od_volt_freq_regions(num_regions, regions); + + delete []regions; + } } } From 733ec3cd2035c00388aeed1d3936c089150c2325 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Thu, 2 May 2024 16:16:29 -0500 Subject: [PATCH 12/13] Updated Changelog with process isolation updates Signed-off-by: Maisam Arif Change-Id: Ic773137ff05b1819f60d42b8a933ef6ebb9addec --- CHANGELOG.md | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d7275eb7a..8411ace100 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,15 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Added +- **Added process isolation and clean shader APIs and CLI commands** +Added APIs and CLI commands to address LeftoverLocals security issues, allowing the SRAM data to be cleared and process isolation to be set on a per-GPU basis. New APIs: + - `amdsmi_get_gpu_process_isolation()` + - `amdsmi_set_gpu_process_isolation()` + - `amdsmi_set_gpu_clear_sram_data()` + - **Added `MIN_POWER` to output of `amd-smi static --limit`** -This change was to help users to identify what range they can change the power cap of the GPU to. We added this to simplify why a device supports (or does not support) power capping (also known as overdrive). See `amd-smi set -g all --power-cap ` or `amd-smi reset -g all --power-cap`. +This change was to help users to identify what range they can change the power cap of the GPU to. We added this to simplify why a device supports (or does not support) power capping (also known as overdrive). See `amd-smi set -g all --power-cap ` or `amd-smi reset -g all --power-cap`.
+ ```shell $ amd-smi static --limit GPU: 0 @@ -61,6 +68,7 @@ GPU PCIE_BW - **Fixed `amd-smi metric --power` now provides power output for Navi2x/Navi3x/MI1x** These systems use an older version of gpu_metrics in amdgpu. This fix only updates what CLI outputs. No change in any of our APIs. + ```shell $ amd-smi metric --power GPU: 0 @@ -274,7 +282,8 @@ GPU: 0 - **Updated `amd-smi topology --json` to align with host/guest** Topology's `--json` output now is changed to align with output reported bt host/guest systems. Additionally, users can select/filter specific topology details as desired (refer to `amd-smi topology -h` for full list). See examples shown below. -*Previous format:* +*Previous format:* + ```shell $ amd-smi topology --json [ @@ -328,6 +337,7 @@ $ amd-smi topology --json ``` *New format:* + ```shell $ amd-smi topology --json [ @@ -359,6 +369,7 @@ $ amd-smi topology --json ... ] ``` + ```shell $ /opt/rocm/bin/amd-smi topology -a -t --json [ From bf6fc51f4f50ec485563d861d155d50077185dbb Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Thu, 2 May 2024 16:38:58 -0500 Subject: [PATCH 13/13] Moved Changelog fixes to correspond with release Signed-off-by: Maisam Arif Change-Id: I28c91f63ceb5d635d588e3d1d5ec1a385ddc467f --- CHANGELOG.md | 82 ++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8411ace100..84ead71a7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,46 @@ GPU: 1 - **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. 
+- **Updated `amd-smi metric --ecc-blocks` output** +The ecc blocks argument was outputting blocks without counters available; updated the filtering to show blocks that counters are available for: + +``` shell +$ amd-smi metric --ecc-block +GPU: 0 + ECC_BLOCKS: + UMC: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + SDMA: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + GFX: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + MMHUB: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + PCIE_BIF: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + HDP: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + XGMI_WAFL: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 +``` + +- **Removed `amdsmi_get_gpu_process_info` from python library** +amdsmi_get_gpu_process_info was removed from the C library in an earlier build, but the API was still in the python interface + ### Optimizations - **Updated `amd-smi monitor --pcie` output** @@ -104,46 +144,6 @@ Previously python interface calls to populated bad pages resulted in a `ValueErr ### Changed -- **Updated `amd-smi metric --ecc-blocks` output** -The ecc blocks arguement was outputing blocks without counters available, updated the filtering show blocks that counters are available for: - -``` shell -$ amd-smi metric --ecc-block -GPU: 0 - ECC_BLOCKS: - UMC: - CORRECTABLE_COUNT: 0 - UNCORRECTABLE_COUNT: 0 - DEFERRED_COUNT: 0 - SDMA: - CORRECTABLE_COUNT: 0 - UNCORRECTABLE_COUNT: 0 - DEFERRED_COUNT: 0 - GFX: - CORRECTABLE_COUNT: 0 - UNCORRECTABLE_COUNT: 0 - DEFERRED_COUNT: 0 - MMHUB: - CORRECTABLE_COUNT: 0 - UNCORRECTABLE_COUNT: 0 - DEFERRED_COUNT: 0 - PCIE_BIF: - CORRECTABLE_COUNT: 0 - UNCORRECTABLE_COUNT: 0 - DEFERRED_COUNT: 0 - HDP: - CORRECTABLE_COUNT: 0 - UNCORRECTABLE_COUNT: 0 - DEFERRED_COUNT: 0 - XGMI_WAFL: - CORRECTABLE_COUNT: 0 - UNCORRECTABLE_COUNT: 0 - DEFERRED_COUNT: 0 -``` - -- **Removed
`amdsmi_get_gpu_process_info` from python library** -amdsmi_get_gpu_process_info was removed from the C library in an earlier build, but the API was still in the python interface - - **Updated metrics --clocks** Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. @@ -280,7 +280,7 @@ GPU: 0 ``` - **Updated `amd-smi topology --json` to align with host/guest** -Topology's `--json` output now is changed to align with output reported bt host/guest systems. Additionally, users can select/filter specific topology details as desired (refer to `amd-smi topology -h` for full list). See examples shown below. +Topology's `--json` output is now changed to align with the output reported by host/guest systems. Additionally, users can select/filter specific topology details as desired (refer to `amd-smi topology -h` for full list). See examples shown below. *Previous format:*