diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 873aa0426e..ce8076f279 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -8,7 +8,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Added -- **Added the following API's to amdsmi_interface.py**. +- **Added the following C APIs to amdsmi_interface.py**. - amdsmi_get_cpu_handle() - amdsmi_get_esmi_err_msg() - amdsmi_get_gpu_event_notification() @@ -24,6 +24,25 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr - The entry `policies` is added to the end of the dictionary to match API definition. - The entry `plpds` is marked for deprecation as it has the same information as `policies`. +- **Added PCIe levels to `amd-smi static --bus` command**. + - The static --bus option has been updated to include the range of PCIe levels that a device may be set to. + - Each level is a 2-tuple composed of the PCIe speed and lane width. + + ```console + $ amd-smi static --bus + GPU: 0 + BUS: + BDF: 0000:43:00.0 + MAX_PCIE_WIDTH: 16 + MAX_PCIE_SPEED: 16 GT/s + PCIE_LEVELS: + 0: (2.5 GT/s, 1) + 1: (5.0 GT/s, 4) + 2: (16.0 GT/s, 16) + PCIE_INTERFACE_VERSION: Gen 4 + SLOT_TYPE: CEM + ``` + - **Added evicted_time metric for kfd processes**. - Time that queues are evicted on a GPU in milliseconds - Added to CLI in `amd-smi monitor -q` and `amd-smi process` @@ -1219,9 +1238,9 @@ Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to GPU: 0 CLK_LEVEL: Successfully changed sclk perf level(s) to 5, 6 - GPU: 1 - CLK_LEVEL: level(s) 5, 6 is/are greater than performance levels supported for device - ``` +GPU: 1 + CLK_LEVEL: clock level(s) 5, 6 is/are greater than sclk frequency levels supported for device GPU ID: 1 BDF:0000:46:00.0 +``` - **Added new command `amd-smi static -C/--clock`**. - This new command displays the clock frequency performance levels for the selected GPUs and clocks.
diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index daf08577fa..0d3464edec 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -428,6 +428,7 @@ class AMDSMICommands(): args.partition = partition if clock: args.clock = clock + # args.clock defaults to False so if it was overwritten to empty list, that indicates that it was given as an arguments but with an empty list if args.clock == []: args.clock = True @@ -534,6 +535,7 @@ class AMDSMICommands(): 'bdf': "N/A", 'max_pcie_width': "N/A", 'max_pcie_speed': "N/A", + 'pcie_levels': "N/A", 'pcie_interface_version': "N/A", 'slot_type': "N/A" } @@ -572,6 +574,21 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info()) + try: + pcie_info = amdsmi_interface.amdsmi_get_gpu_pci_bandwidth(args.gpu) + num_supported = pcie_info['transfer_rate']['num_supported'] + if num_supported != 0: + bus_info['pcie_levels'] = {} + for level in range(0, num_supported): + speed = str(self.helpers.convert_SI_unit(float(pcie_info['transfer_rate']['frequency'][level]), AMDSMIHelpers.SI_Unit.NANO)) + " GT/s" + width = pcie_info['lanes'][level] + level_values = (speed, width) + bus_info['pcie_levels'].update({level: level_values}) + else: + bus_info['pcie_levels'] = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get pci bandwidth info for gpu %s | %s", gpu_id, e.get_error_info()) + static_dict['bus'] = bus_info if args.vbios: try: @@ -1018,7 +1035,6 @@ class AMDSMICommands(): logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['cache_info'] = cache_info_list - # default to printing all clocks, if in current_platform_args; otherwise print specific clocks if 'clock' in current_platform_args and (args.clock == True or isinstance(args.clock, 
list)): original_clock_args = args.clock #save original args.clock value, so we can reset for multiple devices @@ -1056,8 +1072,15 @@ class AMDSMICommands(): try: frequencies = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, clk_type_conversion) + # some clocks may have a sysfs file but no frequencies for whatever reason. + if len(frequencies['frequency']) == 0: + freq_dict = "N/A" + continue freq_dict = {} - freq_dict.update({'current level':frequencies['current']}) + current_level = frequencies['current'] + freq_dict.update({'current_level':current_level}) + current_frequency = str(self.helpers.convert_SI_unit(frequencies['frequency'][current_level], AMDSMIHelpers.SI_Unit.MICRO)) + "MHz" + freq_dict.update({'current_frequency':current_frequency}) freq_dict.update({'frequency_levels':{}}) if frequencies["num_supported"] != 0: for level in range(len(frequencies['frequency'])): @@ -1070,6 +1093,7 @@ class AMDSMICommands(): freq_dict = "N/A" except amdsmi_exception.AmdSmiLibraryException as e: freq_dict = "N/A" + logging.debug("Failed to get clock info for gpu %s | %s", gpu_id, e.get_error_info()) clk_dict[clk] = freq_dict static_dict['clock'] = clk_dict @@ -4563,6 +4587,7 @@ class AMDSMICommands(): args.power_cap is not None, args.soc_pstate is not None, args.xgmi_plpd is not None, + args.pcie is not None, args.clk_level is not None, args.clk_limit is not None, args.process_isolation is not None]): @@ -5087,7 +5112,7 @@ class AMDSMICommands(): gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", "memory_partition", "power_cap", "soc_pstate", "xgmi_plpd", - "process_isolation", "clk_limit", "clk_level"] + "process_isolation", "clk_limit", "clk_level", "pcie"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 85a9533aab..4929049f78 100644 --- 
a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1233,7 +1233,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_soc_pstate_help = f"Set the GPU soc pstate policy using policy id, an integer. Valid id's include:\n\t{soc_pstate_help_info}" xgmi_plpd_help_info = ", ".join(self.helpers.get_xgmi_plpd_policies()) set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id, an integer. Valid id's include:\n\t{xgmi_plpd_help_info}" - set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels." + set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels.\n\tUse `amd-smi static --bus` to find acceptable pcie levels." power_cap_min, power_cap_max = self.helpers.get_power_caps() if power_cap_max != "N/A": power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO) @@ -1284,7 +1284,8 @@ class AMDSMIParser(argparse.ArgumentParser): if self.helpers.is_baremetal(): set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID') set_value_exclusive_group.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=lambda value: self._not_negative_int(value, '--xgmi-plpd'), help=set_xgmi_plpd_help, metavar='POLICY_ID') - set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'FREQ_LEVELS')) + set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'PERF_LEVELS')) + set_value_exclusive_group.add_argument('-L', 
'--clk-limit', action=self._limit_select(), nargs=3, required=False, help=set_clk_limit_help, metavar=('CLK_TYPE', 'LIM_TYPE', 'VALUE')) set_value_exclusive_group.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=lambda value: self._not_negative_int(value, '--process-isolation'), required=False, help=set_process_isolation_help, metavar='STATUS')